2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
# NOTE(review): this listing is an elided excerpt. Original source line numbers
# are fused into each line and several lines are missing (e.g. the
# `def initialize(self):` header that the docstring at source line 72 belongs
# to, and the trailing `pass` bodies of the two _real_* stubs). Comments below
# annotate intent only; no code tokens were changed.
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information from the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title and simplified
32 title, author and others. The information is stored in a dictionary
33 which is then passed to the FileDownloader. The FileDownloader
34 processes this information possibly downloading the video to the file
35 system, among other possible outcomes. The dictionaries must include
40 uploader: Nickname of the video uploader.
42 ext: Video filename extension.
44 player_url: SWF Player URL (may be None).
46 The following fields are optional. Their primary purpose is to allow
47 youtube-dl to serve as the backend for a video search function, such
48 as the one in youtube2mp3. They are only used when their respective
49 forced printing functions are called:
51 thumbnail: Full URL to a video thumbnail image.
52 description: One-line video description.
54 Subclasses of this one should re-define the _real_initialize() and
55 _real_extract() methods and define a _VALID_URL regexp.
56 Probably, they should also be added to the list of extractors.
# Downloader is optional at construction time; set_downloader() can be
# called later to (re)attach one.
62 def __init__(self, downloader=None):
63 """Constructor. Receives an optional downloader."""
65 self.set_downloader(downloader)
# Template-method pattern: suitable()/extract() are the public entry points;
# subclasses override only _real_initialize()/_real_extract().
67 def suitable(self, url):
68 """Receives a URL and returns True if suitable for this IE."""
69 return re.match(self._VALID_URL, url) is not None
# The `def initialize(self):` header for the next two lines is elided from
# this view; it performs one-time setup (authentication, etc.) -- presumably
# guarded by an elided `_ready` flag as in the surrounding project.
72 """Initializes an instance (authentication, etc)."""
74 self._real_initialize()
77 def extract(self, url):
78 """Extracts URL information and returns it in list of dicts."""
80 return self._real_extract(url)
82 def set_downloader(self, downloader):
83 """Sets the downloader for this IE."""
84 self._downloader = downloader
# Stub hooks: bodies (likely `pass`) are elided from this view.
86 def _real_initialize(self):
87 """Real initialization process. Redefine in subclasses."""
90 def _real_extract(self, url):
91 """Real extraction process. Redefine in subclasses."""
# NOTE(review): elided excerpt -- original line numbers are fused into the
# content and many statements (returns after trouble(), try:/else: lines,
# variable initializations) are missing from this view. Comments annotate
# intent; code tokens are unchanged.
95 class YoutubeIE(InfoExtractor):
96 """Information extractor for youtube.com."""
# _VALID_URL: group(1) matches the site prefix, group(2) the video id; the
# conditional `(?(1).+)?` only allows trailing junk when a prefix matched.
98 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
99 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
100 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
101 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
102 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
103 _NETRC_MACHINE = 'youtube'
104 # Listed in order of quality
# Format codes are YouTube "itag" values (strings, not ints).
105 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
106 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file extension map (entries elided from this view except '38').
107 _video_extensions = {
113 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display string map (entries elided from this view).
119 _video_dimensions = {
# report_*: thin status-line helpers; all output goes through the downloader.
137 def report_lang(self):
138 """Report attempt to set language."""
139 self._downloader.to_screen(u'[youtube] Setting language')
141 def report_login(self):
142 """Report attempt to log in."""
143 self._downloader.to_screen(u'[youtube] Logging in')
145 def report_age_confirmation(self):
146 """Report attempt to confirm age."""
147 self._downloader.to_screen(u'[youtube] Confirming age')
149 def report_video_webpage_download(self, video_id):
150 """Report attempt to download video webpage."""
151 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
153 def report_video_info_webpage_download(self, video_id):
154 """Report attempt to download video info webpage."""
155 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
157 def report_video_subtitles_download(self, video_id):
158 """Report attempt to download video subtitles."""
159 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
161 def report_information_extraction(self, video_id):
162 """Report attempt to extract video information."""
163 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
165 def report_unavailable_format(self, video_id, format):
166 """Report extracted video URL."""
167 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
169 def report_rtmp_download(self):
170 """Indicate the download will use the RTMP protocol."""
171 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Converts YouTube's timedtext XML into SubRip (.srt) text.
# NOTE(review): `start = float(start)` and the `srt = ''` accumulator
# initialization are elided from this view -- the arithmetic on `start` at
# source line 181 and the `srt +=` at 186 rely on those missing lines.
173 def _closed_captions_xml_to_srt(self, xml_string):
175 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
176 # TODO parse xml instead of regex
177 for n, (start, dur_tag, dur, caption) in enumerate(texts):
# Captions without an explicit duration are shown for 4 seconds.
178 if not dur: dur = '4'
180 end = start + float(dur)
# SRT timestamp format: HH:MM:SS,mmm
181 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
182 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
183 caption = unescapeHTML(caption)
184 caption = unescapeHTML(caption) # double cycle, intentional
186 srt += start + ' --> ' + end + '\n'
187 srt += caption + '\n\n'
# Prints the itag / extension / dimensions table for --list-formats.
190 def _print_formats(self, formats):
191 print 'Available formats:'
193 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# One-time setup: pick credentials (CLI flags, then .netrc), force the
# English site language, log in, and confirm age. Each network step warns
# (or errors for age confirmation) and the `return` statements after the
# failure branches are elided from this view.
195 def _real_initialize(self):
196 if self._downloader is None:
201 downloader_params = self._downloader.params
203 # Attempt to use provided username and password or .netrc data
204 if downloader_params.get('username', None) is not None:
205 username = downloader_params['username']
206 password = downloader_params['password']
207 elif downloader_params.get('usenetrc', False):
209 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
214 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
215 except (IOError, netrc.NetrcParseError), err:
216 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set site language to English so later regexes against page text match.
220 request = urllib2.Request(self._LANG_URL)
223 urllib2.urlopen(request).read()
224 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
225 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
228 # No authentication to be performed
# Login form fields mimic the signup-page login form.
234 'current_form': 'loginForm',
236 'action_login': 'Log In',
237 'username': username,
238 'password': password,
240 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
243 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
244 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
245 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
247 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
248 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Age confirmation is a hard requirement -- failure is an ERROR, not WARNING.
254 'action_confirm': 'Confirm',
256 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
258 self.report_age_confirmation()
259 age_results = urllib2.urlopen(request).read()
260 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
261 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Main extraction: resolve redirect URLs, fetch the watch page and
# get_video_info, then pick formats and emit one info dict per format.
# NOTE(review): the `return` statements after each trouble() call are elided
# from this view, as are several `if mobj is None:` guards.
264 def _real_extract(self, url):
265 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
266 mobj = re.search(self._NEXT_URL_RE, url)
268 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
270 # Extract video id from URL
271 mobj = re.match(self._VALID_URL, url)
273 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
275 video_id = mobj.group(2)
# has_verified=1 skips the interstitial age gate on the watch page.
278 self.report_video_webpage_download(video_id)
279 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
281 video_webpage = urllib2.urlopen(request).read()
282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
283 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
286 # Attempt to extract SWF player URL
# The URL appears JS-escaped (http:\/\/...); the re.sub strips backslashes.
287 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
289 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several `el` variants of get_video_info until one returns a token.
294 self.report_video_info_webpage_download(video_id)
295 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
296 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
297 % (video_id, el_type))
298 request = urllib2.Request(video_info_url)
300 video_info_webpage = urllib2.urlopen(request).read()
301 video_info = parse_qs(video_info_webpage)
# A `break` after this test is elided from this view.
302 if 'token' in video_info:
304 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
305 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
307 if 'token' not in video_info:
308 if 'reason' in video_info:
309 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
311 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
314 # Start extracting information
315 self.report_information_extraction(video_id)
# uploader (mandatory)
318 if 'author' not in video_info:
319 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
321 video_uploader = urllib.unquote_plus(video_info['author'][0])
# title (mandatory)
324 if 'title' not in video_info:
325 self._downloader.trouble(u'ERROR: unable to extract video title')
327 video_title = urllib.unquote_plus(video_info['title'][0])
328 video_title = video_title.decode('utf-8')
# thumbnail (optional -- only a WARNING when missing)
331 if 'thumbnail_url' not in video_info:
332 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
334 else: # don't panic if we can't find it
335 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the page, normalized to YYYYMMDD by trying
# several locale-independent date formats.
339 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
341 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
342 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
343 for expression in format_expressions:
345 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description: HTML element on the watch page, cleaned to plain text.
350 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
351 if video_description: video_description = clean_html(video_description)
352 else: video_description = ''
# closed captions: list available languages, pick --srt-lang / 'en' / first,
# then fetch and convert to SRT. Failures raise Trouble and are downgraded
# to warnings by the handler below.
355 video_subtitles = None
356 if self._downloader.params.get('writesubtitles', False):
358 self.report_video_subtitles_download(video_id)
359 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
361 srt_list = urllib2.urlopen(request).read()
362 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
363 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
364 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
365 if not srt_lang_list:
366 raise Trouble(u'WARNING: video has no closed captions')
367 if self._downloader.params.get('subtitleslang', False):
368 srt_lang = self._downloader.params.get('subtitleslang')
369 elif 'en' in srt_lang_list:
372 srt_lang = srt_lang_list[0]
373 if not srt_lang in srt_lang_list:
374 raise Trouble(u'WARNING: no closed captions found in the specified language')
375 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
377 srt_xml = urllib2.urlopen(request).read()
378 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
379 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
380 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
381 except Trouble as trouble:
382 self._downloader.trouble(trouble[0])
# token (mandatory for the download URL)
385 video_token = urllib.unquote_plus(video_info['token'][0])
387 # Decide which formats to download
388 req_format = self._downloader.params.get('format', None)
# RTMP streams carry the URL in 'conn'; no itag applies (format None).
390 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
391 self.report_rtmp_download()
392 video_url_list = [(None, video_info['conn'][0])]
393 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# stream map: comma-separated list of querystring-encoded entries.
394 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
395 url_data = [parse_qs(uds) for uds in url_data_strs]
396 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
397 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
# Apply --max-quality cap, then intersect the quality-ordered list with
# what the server actually offers.
399 format_limit = self._downloader.params.get('format_limit', None)
400 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
401 if format_limit is not None and format_limit in available_formats:
402 format_list = available_formats[available_formats.index(format_limit):]
404 format_list = available_formats
405 existing_formats = [x for x in format_list if x in url_map]
406 if len(existing_formats) == 0:
407 self._downloader.trouble(u'ERROR: no known formats available for video')
409 if self._downloader.params.get('listformats', None):
410 self._print_formats(existing_formats)
412 if req_format is None or req_format == 'best':
413 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
414 elif req_format == 'worst':
415 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
416 elif req_format in ('-1', 'all'):
417 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
419 # Specific formats. We pick the first in a slash-delimeted sequence.
420 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
421 req_formats = req_format.split('/')
422 video_url_list = None
423 for rf in req_formats:
425 video_url_list = [(rf, url_map[rf])]
427 if video_url_list is None:
428 self._downloader.trouble(u'ERROR: requested format not available')
431 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one FileDownloader info dict per selected format; the surrounding
# `results = []` / append / return lines are elided from this view.
435 for format_param, video_real_url in video_url_list:
437 video_extension = self._video_extensions.get(format_param, 'flv')
440 'id': video_id.decode('utf-8'),
441 'url': video_real_url.decode('utf-8'),
442 'uploader': video_uploader.decode('utf-8'),
443 'upload_date': upload_date,
444 'title': video_title,
445 'ext': video_extension.decode('utf-8'),
446 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
447 'thumbnail': video_thumbnail.decode('utf-8'),
448 'description': video_description,
449 'player_url': player_url,
450 'subtitles': video_subtitles
# NOTE(review): elided excerpt -- `try:` lines, `return` statements after
# trouble() calls and several `if mobj is None:` guards are missing from
# this view. Comments annotate intent; code tokens are unchanged.
455 class MetacafeIE(InfoExtractor):
456 """Information Extractor for metacafe.com."""
458 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
459 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
460 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
461 IE_NAME = u'metacafe'
463 def __init__(self, downloader=None):
464 InfoExtractor.__init__(self, downloader)
# report_*: thin status-line helpers.
466 def report_disclaimer(self):
467 """Report disclaimer retrieval."""
468 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
470 def report_age_confirmation(self):
471 """Report attempt to confirm age."""
472 self._downloader.to_screen(u'[metacafe] Confirming age')
474 def report_download_webpage(self, video_id):
475 """Report webpage download."""
476 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
478 def report_extraction(self, video_id):
479 """Report information extraction."""
480 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Setup: fetch the family-filter disclaimer page (presumably to obtain
# session cookies), then POST the "over 18" confirmation.
482 def _real_initialize(self):
483 # Retrieve disclaimer
484 request = urllib2.Request(self._DISCLAIMER)
486 self.report_disclaimer()
487 disclaimer = urllib2.urlopen(request).read()
488 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
489 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# The rest of the disclaimer_form dict literal is elided from this view.
495 'submit': "Continue - I'm over 18",
497 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
499 self.report_age_confirmation()
500 disclaimer = urllib2.urlopen(request).read()
501 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
502 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
505 def _real_extract(self, url):
506 # Extract id and simplified title from URL
507 mobj = re.match(self._VALID_URL, url)
509 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
512 video_id = mobj.group(1)
514 # Check if video comes from YouTube
# Metacafe mirrors YouTube videos under a "yt-" id prefix; delegate those
# back to the downloader (which dispatches to YoutubeIE).
515 mobj2 = re.match(r'^yt-(.*)$', video_id)
516 if mobj2 is not None:
517 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
520 # Retrieve video webpage to extract further information
521 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
523 self.report_download_webpage(video_id)
524 webpage = urllib2.urlopen(request).read()
525 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
526 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
529 # Extract URL, uploader and title from webpage
530 self.report_extraction(video_id)
# Primary path: direct mediaURL (+ optional gdaKey auth token). Fallback
# path below parses the flashvars' mediaData JSON-ish blob instead.
531 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
533 mediaURL = urllib.unquote(mobj.group(1))
# Extension is assumed to be the last 3 chars of the URL (e.g. 'flv').
534 video_extension = mediaURL[-3:]
536 # Extract gdaKey if available
537 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
541 gdaKey = mobj.group(1)
542 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback: flashvars -> mediaData -> mediaURL/key.
544 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
546 self._downloader.trouble(u'ERROR: unable to extract media URL')
548 vardict = parse_qs(mobj.group(1))
549 if 'mediaData' not in vardict:
550 self._downloader.trouble(u'ERROR: unable to extract media URL')
552 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
554 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Un-escape JS-escaped slashes in the JSON-style URL.
556 mediaURL = mobj.group(1).replace('\\/', '/')
557 video_extension = mediaURL[-3:]
558 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
560 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
562 self._downloader.trouble(u'ERROR: unable to extract title')
564 video_title = mobj.group(1).decode('utf-8')
566 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
568 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
570 video_uploader = mobj.group(1)
# Returned info dict (surrounding `return [{...}]` brackets elided).
573 'id': video_id.decode('utf-8'),
574 'url': video_url.decode('utf-8'),
575 'uploader': video_uploader.decode('utf-8'),
576 'upload_date': u'NA',
577 'title': video_title,
578 'ext': video_extension.decode('utf-8'),
# NOTE(review): elided excerpt -- `try:` lines, `return` statements after
# trouble() calls and `if mobj is None:` guards are missing from this view.
# Comments annotate intent; code tokens are unchanged.
584 class DailymotionIE(InfoExtractor):
585 """Information Extractor for Dailymotion"""
587 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
588 IE_NAME = u'dailymotion'
590 def __init__(self, downloader=None):
591 InfoExtractor.__init__(self, downloader)
593 def report_download_webpage(self, video_id):
594 """Report webpage download."""
595 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
597 def report_extraction(self, video_id):
598 """Report information extraction."""
599 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
601 def _real_extract(self, url):
602 # Extract id and simplified title from URL
603 mobj = re.match(self._VALID_URL, url)
605 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
608 video_id = mobj.group(1)
# Dailymotion streams are assumed to be FLV.
610 video_extension = 'flv'
612 # Retrieve video webpage to extract further information
613 request = urllib2.Request(url)
# Disable the family filter so age-restricted pages are served in full.
614 request.add_header('Cookie', 'family_filter=off')
616 self.report_download_webpage(video_id)
617 webpage = urllib2.urlopen(request).read()
618 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
619 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
622 # Extract URL, uploader and title from webpage
623 self.report_extraction(video_id)
# The player's "sequence" flashvar holds the stream metadata; the SD
# stream URL ("sdURL") is pulled out of it below.
624 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
626 self._downloader.trouble(u'ERROR: unable to extract media URL')
628 sequence = urllib.unquote(mobj.group(1))
629 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
631 self._downloader.trouble(u'ERROR: unable to extract media URL')
633 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
635 # if needed add http://www.dailymotion.com/ if relative URL
# (The `video_url = mediaURL` assignment is elided from this view.)
639 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
641 self._downloader.trouble(u'ERROR: unable to extract title')
643 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
645 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
647 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
649 video_uploader = mobj.group(1)
# Returned info dict (surrounding `return [{...}]` brackets elided).
652 'id': video_id.decode('utf-8'),
653 'url': video_url.decode('utf-8'),
654 'uploader': video_uploader.decode('utf-8'),
655 'upload_date': u'NA',
656 'title': video_title,
657 'ext': video_extension.decode('utf-8'),
# NOTE(review): elided excerpt -- `try:` lines, `return` statements after
# trouble() calls and `if mobj is None:` guards are missing from this view.
# Comments annotate intent; code tokens are unchanged.
663 class GoogleIE(InfoExtractor):
664 """Information extractor for video.google.com."""
666 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
667 IE_NAME = u'video.google'
669 def __init__(self, downloader=None):
670 InfoExtractor.__init__(self, downloader)
672 def report_download_webpage(self, video_id):
673 """Report webpage download."""
674 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
676 def report_extraction(self, video_id):
677 """Report information extraction."""
678 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
680 def _real_extract(self, url):
681 # Extract id from URL
682 mobj = re.match(self._VALID_URL, url)
684 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
687 video_id = mobj.group(1)
689 video_extension = 'mp4'
691 # Retrieve video webpage to extract further information
692 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
694 self.report_download_webpage(video_id)
695 webpage = urllib2.urlopen(request).read()
696 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
697 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
700 # Extract URL, uploader, and title from webpage
701 self.report_extraction(video_id)
# Preferred path: direct mp4 download_url. Fallback: the hex-escaped FLV
# videoUrl embedded in the page JS (\x3d -> '=', \x26 -> '&').
702 mobj = re.search(r"download_url:'([^']+)'", webpage)
704 video_extension = 'flv'
705 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
707 self._downloader.trouble(u'ERROR: unable to extract media URL')
709 mediaURL = urllib.unquote(mobj.group(1))
710 mediaURL = mediaURL.replace('\\x3d', '\x3d')
711 mediaURL = mediaURL.replace('\\x26', '\x26')
# (The `video_url = mediaURL` assignment is elided from this view.)
715 mobj = re.search(r'<title>(.*)</title>', webpage)
717 self._downloader.trouble(u'ERROR: unable to extract title')
719 video_title = mobj.group(1).decode('utf-8')
721 # Extract video description
722 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
724 self._downloader.trouble(u'ERROR: unable to extract video description')
726 video_description = mobj.group(1).decode('utf-8')
727 if not video_description:
728 video_description = 'No description available.'
730 # Extract video thumbnail
# Thumbnail requires a second request (a video-search page), so it is only
# fetched when --get-thumbnail forces it.
731 if self._downloader.params.get('forcethumbnail', False):
732 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
734 webpage = urllib2.urlopen(request).read()
735 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
736 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
738 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
740 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
742 video_thumbnail = mobj.group(1)
743 else: # we need something to pass to process_info
# (The placeholder `video_thumbnail = ''` assignment is elided from view.)
# Returned info dict (surrounding `return [{...}]` brackets elided).
747 'id': video_id.decode('utf-8'),
748 'url': video_url.decode('utf-8'),
750 'upload_date': u'NA',
751 'title': video_title,
752 'ext': video_extension.decode('utf-8'),
# NOTE(review): elided excerpt -- `try:` lines, `return` statements after
# trouble() calls and `if mobj is None:` guards are missing from this view.
# Comments annotate intent; code tokens are unchanged.
758 class PhotobucketIE(InfoExtractor):
759 """Information extractor for photobucket.com."""
# Only .flv media linked via the `current=` query parameter is supported.
761 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
762 IE_NAME = u'photobucket'
764 def __init__(self, downloader=None):
765 InfoExtractor.__init__(self, downloader)
767 def report_download_webpage(self, video_id):
768 """Report webpage download."""
769 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
771 def report_extraction(self, video_id):
772 """Report information extraction."""
773 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
775 def _real_extract(self, url):
776 # Extract id from URL
777 mobj = re.match(self._VALID_URL, url)
779 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# The "id" here is the .flv filename captured from the URL.
782 video_id = mobj.group(1)
784 video_extension = 'flv'
786 # Retrieve video webpage to extract further information
787 request = urllib2.Request(url)
789 self.report_download_webpage(video_id)
790 webpage = urllib2.urlopen(request).read()
791 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
792 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
795 # Extract URL, uploader, and title from webpage
796 self.report_extraction(video_id)
797 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
799 self._downloader.trouble(u'ERROR: unable to extract media URL')
801 mediaURL = urllib.unquote(mobj.group(1))
# (The `video_url = mediaURL` assignment is elided from this view.)
# Title and uploader both come from one <title> pattern: "X video by Y".
805 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
807 self._downloader.trouble(u'ERROR: unable to extract title')
809 video_title = mobj.group(1).decode('utf-8')
811 video_uploader = mobj.group(2).decode('utf-8')
# Returned info dict (surrounding `return [{...}]` brackets elided).
814 'id': video_id.decode('utf-8'),
815 'url': video_url.decode('utf-8'),
816 'uploader': video_uploader,
817 'upload_date': u'NA',
818 'title': video_title,
819 'ext': video_extension.decode('utf-8'),
# NOTE(review): elided excerpt -- `try:` lines, `return` statements after
# trouble() calls and `if mobj is None:` guards are missing from this view.
# Comments annotate intent; code tokens are unchanged.
825 class YahooIE(InfoExtractor):
826 """Information extractor for video.yahoo.com."""
828 # _VALID_URL matches all Yahoo! Video URLs
829 # _VPAGE_URL matches only the extractable '/watch/' URLs
830 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
831 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
832 IE_NAME = u'video.yahoo'
834 def __init__(self, downloader=None):
835 InfoExtractor.__init__(self, downloader)
837 def report_download_webpage(self, video_id):
838 """Report webpage download."""
839 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
841 def report_extraction(self, video_id):
842 """Report information extraction."""
843 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# `new_video` guards the one-level recursion used to canonicalize non-/watch/
# URLs into extractable /watch/ URLs.
845 def _real_extract(self, url, new_video=True):
846 # Extract ID from URL
847 mobj = re.match(self._VALID_URL, url)
849 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
852 video_id = mobj.group(2)
853 video_extension = 'flv'
855 # Rewrite valid but non-extractable URLs as
856 # extractable English language /watch/ URLs
857 if re.match(self._VPAGE_URL, url) is None:
858 request = urllib2.Request(url)
860 webpage = urllib2.urlopen(request).read()
861 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
862 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
# Scrape the canonical id/vid pair from the page JS, then recurse once.
865 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
867 self._downloader.trouble(u'ERROR: Unable to extract id field')
869 yahoo_id = mobj.group(1)
871 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
873 self._downloader.trouble(u'ERROR: Unable to extract vid field')
875 yahoo_vid = mobj.group(1)
877 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
878 return self._real_extract(url, new_video=False)
880 # Retrieve video webpage to extract further information
881 request = urllib2.Request(url)
883 self.report_download_webpage(video_id)
884 webpage = urllib2.urlopen(request).read()
885 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
886 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
889 # Extract uploader and title from webpage
890 self.report_extraction(video_id)
891 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
893 self._downloader.trouble(u'ERROR: unable to extract video title')
895 video_title = mobj.group(1).decode('utf-8')
897 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
899 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the 'people|profile' alternative, not the
# uploader name in group(2) -- looks like a latent bug in the original;
# verify against the full source before changing.
901 video_uploader = mobj.group(1).decode('utf-8')
903 # Extract video thumbnail
904 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
906 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
908 video_thumbnail = mobj.group(1).decode('utf-8')
910 # Extract video description
911 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
913 self._downloader.trouble(u'ERROR: unable to extract video description')
915 video_description = mobj.group(1).decode('utf-8')
916 if not video_description:
917 video_description = 'No description available.'
919 # Extract video height and width
920 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
922 self._downloader.trouble(u'ERROR: unable to extract video height')
924 yv_video_height = mobj.group(1)
926 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
928 self._downloader.trouble(u'ERROR: unable to extract video width')
930 yv_video_width = mobj.group(1)
932 # Retrieve video playlist to extract media URL
933 # I'm not completely sure what all these options are, but we
934 # seem to need most of them, otherwise the server sends a 401.
935 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
936 yv_bitrate = '700' # according to Wikipedia this is hard-coded
937 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
938 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
939 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
941 self.report_download_webpage(video_id)
942 webpage = urllib2.urlopen(request).read()
943 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
944 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
947 # Extract media URL from playlist XML
948 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
950 self._downloader.trouble(u'ERROR: Unable to extract media URL')
952 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
953 video_url = unescapeHTML(video_url)
# Returned info dict (surrounding `return [{...}]` brackets elided).
# NOTE(review): 'thumbnail' is specified twice (source lines 962 and 964);
# the later entry wins in a dict literal -- the .decode('utf-8') at 962 is
# therefore dead. Worth deduplicating in the full source.
956 'id': video_id.decode('utf-8'),
958 'uploader': video_uploader,
959 'upload_date': u'NA',
960 'title': video_title,
961 'ext': video_extension.decode('utf-8'),
962 'thumbnail': video_thumbnail.decode('utf-8'),
963 'description': video_description,
964 'thumbnail': video_thumbnail,
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines (guards,
# try:/return). Code left byte-identical; only comments added.
969 class VimeoIE(InfoExtractor):
970 	"""Information extractor for vimeo.com."""
972 	# _VALID_URL matches Vimeo URLs
973 	_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
976 	def __init__(self, downloader=None):
977 		InfoExtractor.__init__(self, downloader)
979 	def report_download_webpage(self, video_id):
980 		"""Report webpage download."""
981 		self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
983 	def report_extraction(self, video_id):
984 		"""Report information extraction."""
985 		self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
987 	def _real_extract(self, url, new_video=True):
988 		# Extract ID from URL
989 		mobj = re.match(self._VALID_URL, url)
# (numbering gap 989->991: presumably the `if mobj is None:` guard — confirm)
991 			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
994 		video_id = mobj.group(1)
996 		# Retrieve video webpage to extract further information
997 		request = urllib2.Request(url, None, std_headers)
# (gap 997->999: a `try:` line is elided; the except at 1001 pairs with it)
999 			self.report_download_webpage(video_id)
1000 			webpage = urllib2.urlopen(request).read()
1001 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1002 			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1005 		# Now we begin extracting as much information as we can from what we
1006 		# retrieved. First we extract the information common to all extractors,
1007 		# and latter we extract those that are Vimeo specific.
1008 		self.report_extraction(video_id)
1010 		# Extract the config JSON
# Fragile: pulls the player config out of the page by naive string splitting
# between ' = {config:' and ',assets:' rather than real HTML/JS parsing.
1011 		config = webpage.split(' = {config:')[1].split(',assets:')[0]
1013 			config = json.loads(config)
1015 			self._downloader.trouble(u'ERROR: unable to extract info section')
1019 		video_title = config["video"]["title"]
1022 		video_uploader = config["video"]["owner"]["name"]
1024 		# Extract video thumbnail
1025 		video_thumbnail = config["video"]["thumbnail"]
1027 		# Extract video description
1028 		video_description = get_element_by_id("description", webpage.decode('utf8'))
1029 		if video_description: video_description = clean_html(video_description)
1030 		else: video_description = ''
1032 		# Extract upload date
1033 		video_upload_date = u'NA'
1034 		mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1035 		if mobj is not None:
1036 			video_upload_date = mobj.group(1)
1038 		# Vimeo specific: extract request signature and timestamp
1039 		sig = config['request']['signature']
1040 		timestamp = config['request']['timestamp']
1042 		# Vimeo specific: extract video codec and quality information
1043 		# TODO bind to format param
# First codec in this preference order that appears in config wins.
1044 		codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1045 		for codec in codecs:
1046 			if codec[0] in config["video"]["files"]:
1047 				video_codec = codec[0]
1048 				video_extension = codec[1]
1049 				if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1050 				else: quality = 'sd'
# (gap 1050->1053: presumably a `break` and the for/else no-codec branch — confirm)
1053 			self._downloader.trouble(u'ERROR: no known codec found')
# The actual media URL is a redirect endpoint signed with sig/timestamp.
1056 		video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1057 					%(video_id, sig, timestamp, quality, video_codec.upper())
1062 			'uploader': video_uploader,
1063 			'upload_date': video_upload_date,
1064 			'title': video_title,
1065 			'ext': video_extension,
1066 			'thumbnail': video_thumbnail,
1067 			'description': video_description,
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines (guards,
# try:/return). Code left byte-identical; only comments added.
1072 class GenericIE(InfoExtractor):
1073 	"""Generic last-resort information extractor."""
1076 	IE_NAME = u'generic'
1078 	def __init__(self, downloader=None):
1079 		InfoExtractor.__init__(self, downloader)
1081 	def report_download_webpage(self, video_id):
1082 		"""Report webpage download."""
1083 		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1084 		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1086 	def report_extraction(self, video_id):
1087 		"""Report information extraction."""
1088 		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1090 	def report_following_redirect(self, new_url):
1091 		"""Report information extraction."""
1092 		self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1094 	def _test_redirect(self, url):
1095 		"""Check if it is a redirect, like url shorteners, in case restart chain."""
# Local Request subclass that issues HEAD instead of GET, to resolve
# shorteners without downloading bodies.
1096 		class HeadRequest(urllib2.Request):
1097 			def get_method(self):
# (gap 1097->1100: method body elided — presumably `return "HEAD"`)
1100 		class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1102 			Subclass the HTTPRedirectHandler to make it use our
1103 			HeadRequest also on the redirected URL
1105 			def redirect_request(self, req, fp, code, msg, headers, newurl):
1106 				if code in (301, 302, 303, 307):
1107 					newurl = newurl.replace(' ', '%20')
# Strip body-describing headers before re-issuing as HEAD.
1108 					newheaders = dict((k,v) for k,v in req.headers.items()
1109 									  if k.lower() not in ("content-length", "content-type"))
1110 					return HeadRequest(newurl,
# (gap 1110->1112 and 1112->1115: remaining HeadRequest kwargs elided)
1112 									   origin_req_host=req.get_origin_req_host(),
1115 					raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1117 		class HTTPMethodFallback(urllib2.BaseHandler):
1119 			Fallback to GET if HEAD is not allowed (405 HTTP error)
1121 			def http_error_405(self, req, fp, code, msg, headers):
# (gap 1121->1125: presumably fp.read()/fp.close() cleanup — confirm)
1125 				newheaders = dict((k,v) for k,v in req.headers.items()
1126 								  if k.lower() not in ("content-length", "content-type"))
# Re-open the same URL with a plain (GET) Request through the parent opener.
1127 				return self.parent.open(urllib2.Request(req.get_full_url(),
1129 												 origin_req_host=req.get_origin_req_host(),
# Build a dedicated opener so the custom HEAD/fallback handlers apply only here.
1133 		opener = urllib2.OpenerDirector()
1134 		for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1135 						HTTPMethodFallback, HEADRedirectHandler,
1136 						urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1137 			opener.add_handler(handler())
1139 		response = opener.open(HeadRequest(url))
1140 		new_url = response.geturl()
1142 		if url == new_url: return False
# Redirect detected: hand the resolved URL back to the downloader so the
# whole extractor chain restarts on it.
1144 		self.report_following_redirect(new_url)
1145 		self._downloader.download([new_url])
# (gap 1145->1148: presumably `return True` — confirm; _real_extract relies on it)
1148 	def _real_extract(self, url):
1149 		if self._test_redirect(url): return
1151 		video_id = url.split('/')[-1]
1152 		request = urllib2.Request(url)
# (gap 1152->1154: `try:` elided; excepts at 1156/1159 pair with it)
1154 			self.report_download_webpage(video_id)
1155 			webpage = urllib2.urlopen(request).read()
1156 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1157 			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1159 		except ValueError, err:
1160 			# since this is the last-resort InfoExtractor, if
1161 			# this error is thrown, it'll be thrown here
1162 			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1165 		self.report_extraction(video_id)
1166 		# Start with something easy: JW Player in SWFObject
1167 		mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1169 			# Broaden the search a little bit
1170 			mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1172 			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1175 		# It's possible that one of the regexes
1176 		# matched, but returned an empty group:
1177 		if mobj.group(1) is None:
1178 			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1181 		video_url = urllib.unquote(mobj.group(1))
1182 		video_id = os.path.basename(video_url)
1184 		# here's a fun little line of code for you:
1185 		video_extension = os.path.splitext(video_id)[1][1:]
1186 		video_id = os.path.splitext(video_id)[0]
1188 		# it's tempting to parse this further, but you would
1189 		# have to take into account all the variations like
1190 		#   Video Title - Site Name
1191 		#   Site Name | Video Title
1192 		#   Video Title - Tagline | Site Name
1193 		# and so on and so forth; it's just not practical
1194 		mobj = re.search(r'<title>(.*)</title>', webpage)
1196 			self._downloader.trouble(u'ERROR: unable to extract title')
1198 		video_title = mobj.group(1).decode('utf-8')
1200 		# video uploader is domain name
1201 		mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1203 			self._downloader.trouble(u'ERROR: unable to extract title')
1205 		video_uploader = mobj.group(1).decode('utf-8')
1208 			'id': video_id.decode('utf-8'),
1209 			'url': video_url.decode('utf-8'),
1210 			'uploader': video_uploader,
1211 			'upload_date': u'NA',
1212 			'title': video_title,
1213 			'ext': video_extension.decode('utf-8'),
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1220 class YoutubeSearchIE(InfoExtractor):
1221 	"""Information Extractor for YouTube search queries."""
1222 	_VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# gdata v2 API returns pages of up to 50 results as JSON-C.
1223 	_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1224 	_max_youtube_results = 1000
1225 	IE_NAME = u'youtube:search'
1226 	def __init__(self, downloader=None):
1227 		InfoExtractor.__init__(self, downloader)
1229 	def report_download_page(self, query, pagenum):
1230 		"""Report attempt to download playlist page with given number."""
1231 		query = query.decode(preferredencoding())
1232 		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1234 	def _real_extract(self, query):
1235 		mobj = re.match(self._VALID_URL, query)
# (gap 1235->1237: presumably the `if mobj is None:` guard and a `return`)
1237 			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split the 'ytsearchN' prefix from the actual query text.
1240 		prefix, query = query.split(':')
1242 		query = query.encode('utf-8')
# (gap 1242->1244: presumably `if prefix == '':` — empty prefix means 1 result)
1244 			self._download_n_results(query, 1)
1246 		elif prefix == 'all':
1247 			self._download_n_results(query, self._max_youtube_results)
# (gap 1247->1253: else branch parsing `n = int(prefix)` and `if n <= 0:` — confirm)
1253 					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1255 				elif n > self._max_youtube_results:
1256 					self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1257 					n = self._max_youtube_results
1258 				self._download_n_results(query, n)
1260 			except ValueError: # parsing prefix as integer fails
1261 				self._download_n_results(query, 1)
1264 	def _download_n_results(self, query, n):
1265 		"""Downloads a specified number of results for a query"""
# (gap 1265->1271: initialization of video_ids/pagenum/limit elided)
1271 		while (50 * pagenum) < limit:
1272 			self.report_download_page(query, pagenum+1)
# start-index is 1-based, hence the +1.
1273 			result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1274 			request = urllib2.Request(result_url)
# (gap 1274->1276: `try:` elided)
1276 				data = urllib2.urlopen(request).read()
1277 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1278 				self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1280 			api_response = json.loads(data)['data']
1282 			new_ids = list(video['id'] for video in api_response['items'])
1283 			video_ids += new_ids
# Cap at the API-reported total so the loop terminates on short result sets.
1285 			limit = min(n, api_response['totalItems'])
1288 		if len(video_ids) > n:
1289 			video_ids = video_ids[:n]
1290 		for id in video_ids:
1291 			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1295 class GoogleSearchIE(InfoExtractor):
1296 	"""Information Extractor for Google Video search queries."""
1297 	_VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1298 	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1299 	_VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" pager link; absence ends pagination.
1300 	_MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1301 	_max_google_results = 1000
1302 	IE_NAME = u'video.google:search'
1304 	def __init__(self, downloader=None):
1305 		InfoExtractor.__init__(self, downloader)
1307 	def report_download_page(self, query, pagenum):
1308 		"""Report attempt to download playlist page with given number."""
1309 		query = query.decode(preferredencoding())
1310 		self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Same prefix-parsing shape as YoutubeSearchIE._real_extract.
1312 	def _real_extract(self, query):
1313 		mobj = re.match(self._VALID_URL, query)
# (gap 1313->1315: presumably `if mobj is None:` guard and `return`)
1315 			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1318 		prefix, query = query.split(':')
1320 		query = query.encode('utf-8')
# (gap 1320->1322: presumably `if prefix == '':`)
1322 			self._download_n_results(query, 1)
1324 		elif prefix == 'all':
1325 			self._download_n_results(query, self._max_google_results)
# (gap 1325->1331: else branch parsing `n = int(prefix)` — confirm)
1331 					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1333 				elif n > self._max_google_results:
1334 					self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1335 					n = self._max_google_results
1336 				self._download_n_results(query, n)
1338 			except ValueError: # parsing prefix as integer fails
1339 				self._download_n_results(query, 1)
1342 	def _download_n_results(self, query, n):
1343 		"""Downloads a specified number of results for a query"""
# (gap 1343->1349: video_ids/pagenum init and loop header elided)
1349 			self.report_download_page(query, pagenum)
# Google paginates search results 10 at a time via the `start` parameter.
1350 			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1351 			request = urllib2.Request(result_url)
# (gap 1351->1353: `try:` elided)
1353 				page = urllib2.urlopen(request).read()
1354 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1355 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1358 			# Extract video identifiers
1359 			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1360 				video_id = mobj.group(1)
1361 				if video_id not in video_ids:
1362 					video_ids.append(video_id)
1363 					if len(video_ids) == n:
1364 						# Specified n videos reached
1365 						for id in video_ids:
1366 							self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next" pager link means we've run out of results: flush and stop.
1369 			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1370 				for id in video_ids:
1371 					self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1374 			pagenum = pagenum + 1
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1377 class YahooSearchIE(InfoExtractor):
1378 	"""Information Extractor for Yahoo! Video search queries."""
1379 	_VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1380 	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1381 	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1382 	_MORE_PAGES_INDICATOR = r'\s*Next'
1383 	_max_yahoo_results = 1000
1384 	IE_NAME = u'video.yahoo:search'
1386 	def __init__(self, downloader=None):
1387 		InfoExtractor.__init__(self, downloader)
1389 	def report_download_page(self, query, pagenum):
1390 		"""Report attempt to download playlist page with given number."""
1391 		query = query.decode(preferredencoding())
1392 		self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Same prefix-parsing shape as the other *SearchIE extractors.
1394 	def _real_extract(self, query):
1395 		mobj = re.match(self._VALID_URL, query)
# (gap 1395->1397: presumably `if mobj is None:` guard and `return`)
1397 			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1400 		prefix, query = query.split(':')
1402 		query = query.encode('utf-8')
# (gap 1402->1404: presumably `if prefix == '':`)
1404 			self._download_n_results(query, 1)
1406 		elif prefix == 'all':
1407 			self._download_n_results(query, self._max_yahoo_results)
# (gap 1407->1413: else branch parsing `n = int(prefix)` — confirm)
1413 					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1415 				elif n > self._max_yahoo_results:
1416 					self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1417 					n = self._max_yahoo_results
1418 				self._download_n_results(query, n)
1420 			except ValueError: # parsing prefix as integer fails
1421 				self._download_n_results(query, 1)
1424 	def _download_n_results(self, query, n):
1425 		"""Downloads a specified number of results for a query"""
# Unlike GoogleSearchIE, dedup is via a set alongside the ordered list.
1428 		already_seen = set()
# (gap 1428->1432: video_ids/pagenum init and loop header elided)
1432 			self.report_download_page(query, pagenum)
1433 			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1434 			request = urllib2.Request(result_url)
# (gap 1434->1436: `try:` elided)
1436 				page = urllib2.urlopen(request).read()
1437 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1438 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1441 			# Extract video identifiers
1442 			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1443 				video_id = mobj.group(1)
1444 				if video_id not in already_seen:
1445 					video_ids.append(video_id)
1446 					already_seen.add(video_id)
1447 					if len(video_ids) == n:
1448 						# Specified n videos reached
1449 						for id in video_ids:
1450 							self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" link means the result set is exhausted: flush and stop.
1453 			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1454 				for id in video_ids:
1455 					self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1458 			pagenum = pagenum + 1
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1461 class YoutubePlaylistIE(InfoExtractor):
1462 	"""Information Extractor for YouTube playlists."""
# Matches playlists, artist pages, courses and user channel paginations;
# group(1)=prefix char (p|a|list), group(2)=playlist id, group(3)=optional video id.
1464 	_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1465 	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1466 	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
1467 	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1468 	IE_NAME = u'youtube:playlist'
1470 	def __init__(self, downloader=None):
1471 		InfoExtractor.__init__(self, downloader)
1473 	def report_download_page(self, playlist_id, pagenum):
1474 		"""Report attempt to download playlist page with given number."""
1475 		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1477 	def _real_extract(self, url):
1478 		# Extract playlist id
1479 		mobj = re.match(self._VALID_URL, url)
# (gap 1479->1481: presumably `if mobj is None:` guard and `return`)
1481 			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# URL names a single video inside the playlist: delegate and stop.
1485 		if mobj.group(3) is not None:
1486 			self._downloader.download([mobj.group(3)])
# (gap 1486->1489: presumably `return`)
1489 		# Download playlist pages
1490 		# prefix is 'p' as default for playlists but there are other types that need extra care
1491 		playlist_prefix = mobj.group(1)
1492 		if playlist_prefix == 'a':
1493 			playlist_access = 'artist'
# (gap 1493->1495: `else:` elided)
1495 			playlist_prefix = 'p'
1496 			playlist_access = 'view_play_list'
1497 		playlist_id = mobj.group(2)
# (gap 1497->1502: video_ids/pagenum init and loop header elided)
1502 			self.report_download_page(playlist_id, pagenum)
1503 			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1504 			request = urllib2.Request(url)
# (gap 1504->1506: `try:` elided)
1506 				page = urllib2.urlopen(request).read()
1507 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1508 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1511 			# Extract video identifiers
1513 			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1514 				if mobj.group(1) not in ids_in_page:
1515 					ids_in_page.append(mobj.group(1))
1516 			video_ids.extend(ids_in_page)
1518 			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
# (gap 1518->1520: presumably `break`)
1520 			pagenum = pagenum + 1
# --playlist-start is 1-based on the CLI; convert to a 0-based slice index.
1522 		playliststart = self._downloader.params.get('playliststart', 1) - 1
1523 		playlistend = self._downloader.params.get('playlistend', -1)
1524 		if playlistend == -1:
1525 			video_ids = video_ids[playliststart:]
# (gap 1525->1527: `else:` elided)
1527 			video_ids = video_ids[playliststart:playlistend]
1529 		for id in video_ids:
1530 			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1534 class YoutubeUserIE(InfoExtractor):
1535 	"""Information Extractor for YouTube users."""
1537 	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1538 	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# The gdata uploads feed serves at most 50 entries per request.
1539 	_GDATA_PAGE_SIZE = 50
1540 	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1541 	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1542 	IE_NAME = u'youtube:user'
1544 	def __init__(self, downloader=None):
1545 		InfoExtractor.__init__(self, downloader)
1547 	def report_download_page(self, username, start_index):
1548 		"""Report attempt to download user page."""
1549 		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1550 				(username, start_index, start_index + self._GDATA_PAGE_SIZE))
1552 	def _real_extract(self, url):
1554 		mobj = re.match(self._VALID_URL, url)
# (gap 1554->1556: presumably `if mobj is None:` guard and `return`)
1556 			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1559 		username = mobj.group(1)
1561 		# Download video ids using YouTube Data API. Result size per
1562 		# query is limited (currently to 50 videos) so we need to query
1563 		# page by page until there are no video ids - it means we got
# (gap 1563->1570: rest of comment plus video_ids/pagenum init and loop header elided)
# start-index is 1-based in the gdata API.
1570 			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1571 			self.report_download_page(username, start_index)
1573 			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
# (gap 1573->1576: `try:` elided)
1576 				page = urllib2.urlopen(request).read()
1577 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1578 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1581 			# Extract video identifiers
1584 			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1585 				if mobj.group(1) not in ids_in_page:
1586 					ids_in_page.append(mobj.group(1))
1588 			video_ids.extend(ids_in_page)
1590 			# A little optimization - if current page is not
1591 			# "full", ie. does not contain PAGE_SIZE video ids then
1592 			# we can assume that this page is the last one - there
1593 			# are no more ids on further pages - no need to query
1596 			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# (gap 1596->1601: presumably `break` and `pagenum += 1`)
1601 		all_ids_count = len(video_ids)
# --playlist-start is 1-based on the CLI; convert to a 0-based slice index.
1602 		playliststart = self._downloader.params.get('playliststart', 1) - 1
1603 		playlistend = self._downloader.params.get('playlistend', -1)
1605 		if playlistend == -1:
1606 			video_ids = video_ids[playliststart:]
# (gap 1606->1608: `else:` elided)
1608 			video_ids = video_ids[playliststart:playlistend]
1610 		self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1611 				(username, all_ids_count, len(video_ids)))
1613 		for video_id in video_ids:
1614 			self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1617 class BlipTVUserIE(InfoExtractor):
1618 	"""Information Extractor for blip.tv users."""
1620 	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
# NOTE(review): this class reads self._PAGE_SIZE below, but no _PAGE_SIZE
# definition is visible in this extract — it is presumably defined on an
# elided line; verify against the full file.
1622 	IE_NAME = u'blip.tv:user'
1624 	def __init__(self, downloader=None):
1625 		InfoExtractor.__init__(self, downloader)
1627 	def report_download_page(self, username, pagenum):
1628 		"""Report attempt to download user page."""
1629 		self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1630 				(self.IE_NAME, username, pagenum))
1632 	def _real_extract(self, url):
1634 		mobj = re.match(self._VALID_URL, url)
# (gap 1634->1636: presumably `if mobj is None:` guard and `return`)
1636 			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1639 		username = mobj.group(1)
# First fetch the profile page to discover the JSON/RSS data-source URL.
1643 		request = urllib2.Request(url)
# (gap 1643->1646: `try:` elided)
1646 			page = urllib2.urlopen(request).read().decode('utf-8')
1647 			mobj = re.search(r'data-source-url="([^"]+)"', page)
# NOTE(review): if data-source-url is absent, mobj.group(1) raises
# AttributeError here, which this except clause does not catch — confirm
# intent against the full file.
1648 			page_base = "http://blip.tv" + unescapeHTML(mobj.group(1))
1649 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1650 			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1654 		# Download video ids using BlipTV Page API. Result size per
1655 		# query is limited (currently to 10 videos) so we need to query
1656 		# page by page until there are no video ids - it means we got
# (gap 1656->1663: rest of comment plus video_ids/pagenum init and loop header elided)
1663 			self.report_download_page(username, pagenum)
# Page API is 1-based, hence pagenum+1.
1665 			request = urllib2.Request( page_base + "&page=" + str(pagenum+1) )
# (gap 1665->1668: `try:` elided)
1668 				page = urllib2.urlopen(request).read().decode('utf-8')
1669 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1670 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1673 			# Extract video identifiers
1676 			for mobj in re.finditer(r'href="/([^"]+)"', page):
1677 				if mobj.group(1) not in ids_in_page:
1678 					ids_in_page.append(unescapeHTML(mobj.group(1)))
1680 			video_ids.extend(ids_in_page)
1682 			# A little optimization - if current page is not
1683 			# "full", ie. does not contain PAGE_SIZE video ids then
1684 			# we can assume that this page is the last one - there
1685 			# are no more ids on further pages - no need to query
1688 			if len(ids_in_page) < self._PAGE_SIZE:
# (gap 1688->1693: presumably `break` and `pagenum += 1`)
1693 		all_ids_count = len(video_ids)
# --playlist-start is 1-based on the CLI; convert to a 0-based slice index.
1694 		playliststart = self._downloader.params.get('playliststart', 1) - 1
1695 		playlistend = self._downloader.params.get('playlistend', -1)
1697 		if playlistend == -1:
1698 			video_ids = video_ids[playliststart:]
# (gap 1698->1700: `else:` elided)
1700 			video_ids = video_ids[playliststart:playlistend]
1702 		self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1703 				(self.IE_NAME, username, all_ids_count, len(video_ids)))
1705 		for video_id in video_ids:
1706 			self._downloader.download([u'http://blip.tv/'+video_id])
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1709 class DepositFilesIE(InfoExtractor):
1710 	"""Information extractor for depositfiles.com"""
# (?#locale) is a regex comment; the '../' before it matches a 2-char locale path.
1712 	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1713 	IE_NAME = u'DepositFiles'
1715 	def __init__(self, downloader=None):
1716 		InfoExtractor.__init__(self, downloader)
1718 	def report_download_webpage(self, file_id):
1719 		"""Report webpage download."""
1720 		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1722 	def report_extraction(self, file_id):
1723 		"""Report information extraction."""
1724 		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1726 	def _real_extract(self, url):
1727 		file_id = url.split('/')[-1]
1728 		# Rebuild url in english locale
1729 		url = 'http://depositfiles.com/en/files/' + file_id
1731 		# Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button.
1732 		free_download_indication = { 'gateway_result' : '1' }
1733 		request = urllib2.Request(url, urllib.urlencode(free_download_indication))
# (gap 1733->1735: `try:` elided)
1735 			self.report_download_webpage(file_id)
1736 			webpage = urllib2.urlopen(request).read()
1737 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1738 			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1741 		# Search for the real file URL
1742 		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1743 		if (mobj is None) or (mobj.group(1) is None):
1744 			# Try to figure out reason of the error.
1745 			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1746 			if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace in the site's restriction notice for a one-line error.
1747 				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1748 				self._downloader.trouble(u'ERROR: %s' % restriction_message)
# (gap 1748->1750: `else:` elided)
1750 				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1753 		file_url = mobj.group(1)
1754 		file_extension = os.path.splitext(file_url)[1][1:]
1756 		# Search for file title
1757 		mobj = re.search(r'<b title="(.*?)">', webpage)
# (gap 1757->1759: presumably `if mobj is None:` guard and `return`)
1759 			self._downloader.trouble(u'ERROR: unable to extract title')
1761 		file_title = mobj.group(1).decode('utf-8')
1764 			'id': file_id.decode('utf-8'),
1765 			'url': file_url.decode('utf-8'),
1767 			'upload_date': u'NA',
1768 			'title': file_title,
1769 			'ext': file_extension.decode('utf-8'),
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1775 class FacebookIE(InfoExtractor):
1776 	"""Information Extractor for Facebook"""
1778 	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1779 	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1780 	_NETRC_MACHINE = 'facebook'
# Ordered best-to-worst; format selection below depends on this order.
1781 	_available_formats = ['video', 'highqual', 'lowqual']
1782 	_video_extensions = {
# (gap 1782->1787: dict entries mapping the formats above to extensions elided)
1787 	IE_NAME = u'facebook'
1789 	def __init__(self, downloader=None):
1790 		InfoExtractor.__init__(self, downloader)
1792 	def _reporter(self, message):
1793 		"""Add header and report message."""
1794 		self._downloader.to_screen(u'[facebook] %s' % message)
1796 	def report_login(self):
1797 		"""Report attempt to log in."""
1798 		self._reporter(u'Logging in')
1800 	def report_video_webpage_download(self, video_id):
1801 		"""Report attempt to download video webpage."""
1802 		self._reporter(u'%s: Downloading video webpage' % video_id)
1804 	def report_information_extraction(self, video_id):
1805 		"""Report attempt to extract video information."""
1806 		self._reporter(u'%s: Extracting video information' % video_id)
1808 	def _parse_page(self, video_webpage):
1809 		"""Extract video information from page"""
# Each key maps to a regex scraping one field from inline JS/HTML.
1811 		data = {'title': r'\("video_title", "(.*?)"\)',
1812 			'description': r'<div class="datawrap">(.*?)</div>',
1813 			'owner': r'\("video_owner_name", "(.*?)"\)',
1814 			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
# (gap 1814->1817: dict close and video_info init elided)
1817 		for piece in data.keys():
1818 			mobj = re.search(data[piece], video_webpage)
1819 			if mobj is not None:
# Values are JS-escaped inside the utf-8 page; unescape then unquote.
1820 				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# (gap 1820->1824: video_urls init elided)
1824 		for fmt in self._available_formats:
1825 			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1826 			if mobj is not None:
1827 				# URL is in a Javascript segment inside an escaped Unicode format within
1828 				# the generally utf-8 page
1829 				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1830 		video_info['video_urls'] = video_urls
# (gap 1830->1834: presumably `return video_info`)
1834 	def _real_initialize(self):
1835 		if self._downloader is None:
# (gap 1835->1840: presumably `return` plus useremail/password init)
1840 		downloader_params = self._downloader.params
1842 		# Attempt to use provided username and password or .netrc data
1843 		if downloader_params.get('username', None) is not None:
1844 			useremail = downloader_params['username']
1845 			password = downloader_params['password']
1846 		elif downloader_params.get('usenetrc', False):
# (gap 1846->1848: `try:` elided)
1848 				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1849 				if info is not None:
# (gap 1849->1853: unpacking of (useremail, password) from info and `else:` elided)
1853 					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1854 			except (IOError, netrc.NetrcParseError), err:
1855 				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials at all: skip login entirely (anonymous access).
1858 		if useremail is None:
# (gap 1858->1867: `return`, login_form construction, and report_login elided)
1867 		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
# (gap 1867->1870: `try:` elided)
1870 			login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, authentication failed.
1871 			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1872 				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1874 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1875 			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1878 	def _real_extract(self, url):
1879 		mobj = re.match(self._VALID_URL, url)
# (gap 1879->1881: presumably `if mobj is None:` guard and `return`)
1881 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1883 		video_id = mobj.group('ID')
1886 		self.report_video_webpage_download(video_id)
1887 		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
# (gap 1887->1889: `try:` elided)
1889 			page = urllib2.urlopen(request)
1890 			video_webpage = page.read()
1891 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1892 			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1895 		# Start extracting information
1896 		self.report_information_extraction(video_id)
1898 		# Extract information
1899 		video_info = self._parse_page(video_webpage)
1902 		if 'owner' not in video_info:
1903 			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1905 		video_uploader = video_info['owner']
1908 		if 'title' not in video_info:
1909 			self._downloader.trouble(u'ERROR: unable to extract video title')
1911 		video_title = video_info['title']
1912 		video_title = video_title.decode('utf-8')
1915 		if 'thumbnail' not in video_info:
1916 			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1917 			video_thumbnail = ''
# (gap 1917->1919: `else:` elided)
1919 			video_thumbnail = video_info['thumbnail']
# Upload date arrives RFC-2822 formatted; normalize to YYYYMMDD.
1923 		if 'upload_date' in video_info:
1924 			upload_time = video_info['upload_date']
1925 			timetuple = email.utils.parsedate_tz(upload_time)
1926 			if timetuple is not None:
# (gap 1926->1928: `try:` elided — strftime can raise on out-of-range years)
1928 					upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1933 		video_description = video_info.get('description', 'No description available.')
1935 		url_map = video_info['video_urls']
1936 		if len(url_map.keys()) > 0:
1937 			# Decide which formats to download
1938 			req_format = self._downloader.params.get('format', None)
1939 			format_limit = self._downloader.params.get('format_limit', None)
1941 			if format_limit is not None and format_limit in self._available_formats:
1942 				format_list = self._available_formats[self._available_formats.index(format_limit):]
# (gap 1942->1944: `else:` elided)
1944 				format_list = self._available_formats
1945 			existing_formats = [x for x in format_list if x in url_map]
1946 			if len(existing_formats) == 0:
1947 				self._downloader.trouble(u'ERROR: no known formats available for video')
1949 			if req_format is None:
1950 				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1951 			elif req_format == 'worst':
1952 				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1953 			elif req_format == '-1':
1954 				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
# (gap 1954->1957: `else:` elided — a specific format was requested)
1957 				if req_format not in url_map:
1958 					self._downloader.trouble(u'ERROR: requested format not available')
1960 				video_url_list = [(req_format, url_map[req_format])] # Specific format
1963 		for format_param, video_real_url in video_url_list:
1965 			video_extension = self._video_extensions.get(format_param, 'mp4')
1968 				'id': video_id.decode('utf-8'),
1969 				'url': video_real_url.decode('utf-8'),
1970 				'uploader': video_uploader.decode('utf-8'),
1971 				'upload_date': upload_date,
1972 				'title': video_title,
1973 				'ext': video_extension.decode('utf-8'),
1974 				'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1975 				'thumbnail': video_thumbnail.decode('utf-8'),
1976 				'description': video_description.decode('utf-8'),
1981 class BlipTVIE(InfoExtractor):
# Fetches blip.tv metadata through the site's JSON API (skin=json), unless the
# URL turns out to be a direct media link (Content-Type: video/*).
# NOTE(review): gaps in the embedded line numbering show that lines were elided
# from this listing (the `if mobj is None:` guards, `try:` lines and `return`
# statements, the definition of `cchar` and `info`, the result dict header).
1982 """Information extractor for blip.tv"""
1984 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Regex used to pull the filename extension off the media URL.
1985 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1986 IE_NAME = u'blip.tv'
1988 def report_extraction(self, file_id):
1989 """Report information extraction."""
1990 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1992 def report_direct_download(self, title):
1993 """Report information extraction."""
1994 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
1996 def _real_extract(self, url):
1997 mobj = re.match(self._VALID_URL, url)
1999 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for a JSON description of the page. `cchar` is presumably '?'
# or '&' depending on whether the URL already carries a query string — its
# definition is on an elided line; confirm against the full file.
2006 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2007 request = urllib2.Request(json_url)
2008 self.report_extraction(mobj.group(1))
2011 urlh = urllib2.urlopen(request)
2012 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# The URL pointed straight at a media file: derive title/ext from the
# URL's basename instead of parsing JSON.
2013 basename = url.split('/')[-1]
2014 title,ext = os.path.splitext(basename)
2015 title = title.decode('UTF-8')
2016 ext = ext.replace('.', '')
2017 self.report_direct_download(title)
2025 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2026 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2028 if info is None: # Regular URL
# Not a direct download: read and parse the JSON API response.
2030 json_code = urlh.read()
2031 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2032 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2036 json_data = json.loads(json_code)
2037 if 'Post' in json_data:
2038 data = json_data['Post']
# Reformat blip.tv's datestamp (e.g. '11-07-09 06:54AM') to YYYYMMDD.
# NOTE(review): '%H:%M%p' mixes the 24-hour code %H with %p; per the
# time.strptime docs %p only affects the hour when %I is used — confirm
# whether %I was intended here.
2042 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2043 video_url = data['media']['url']
# Derive the container extension from the media URL.
2044 umobj = re.match(self._URL_EXT, video_url)
2046 raise ValueError('Can not determine filename extension')
2047 ext = umobj.group(1)
# Result dict fields, all taken directly from the 'Post' JSON object.
2050 'id': data['item_id'],
2052 'uploader': data['display_name'],
2053 'upload_date': upload_date,
2054 'title': data['title'],
2056 'format': data['media']['mimeType'],
2057 'thumbnail': data['thumbnailUrl'],
2058 'description': data['description'],
2059 'player_url': data['embedUrl']
# Any missing/malformed JSON field surfaces here as ValueError/KeyError.
2061 except (ValueError,KeyError), err:
2062 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2068 class MyVideoIE(InfoExtractor):
2069 """Information Extractor for myvideo.de."""
2071 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2072 IE_NAME = u'myvideo'
2074 def __init__(self, downloader=None):
2075 InfoExtractor.__init__(self, downloader)
2077 def report_download_webpage(self, video_id):
2078 """Report webpage download."""
2079 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2081 def report_extraction(self, video_id):
2082 """Report information extraction."""
2083 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2085 def _real_extract(self,url):
2086 mobj = re.match(self._VALID_URL, url)
2088 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2091 video_id = mobj.group(1)
2094 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2096 self.report_download_webpage(video_id)
2097 webpage = urllib2.urlopen(request).read()
2098 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2099 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2102 self.report_extraction(video_id)
2103 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2106 self._downloader.trouble(u'ERROR: unable to extract media URL')
2108 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2110 mobj = re.search('<title>([^<]+)</title>', webpage)
2112 self._downloader.trouble(u'ERROR: unable to extract title')
2115 video_title = mobj.group(1)
2121 'upload_date': u'NA',
2122 'title': video_title,
2128 class ComedyCentralIE(InfoExtractor):
# Resolves Daily Show / Colbert Report episodes: shortname URLs redirect to the
# newest full episode, then the page's Flash params give an mtvnservices URI
# that is looked up in an MRSS index feed; each <item> gets a mediaGen config
# XML listing renditions, of which the last (highest bitrate) is downloaded.
# NOTE(review): gaps in the embedded numbering show elided lines (the
# `if mobj is None:` guards, `return` statements, `try:` lines, the per-item
# info dict header and the `results` initialisation).
2129 """Information extractor for The Daily Show and Colbert Report """
2131 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2132 IE_NAME = u'comedycentral'
2134 def report_extraction(self, episode_id):
2135 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2137 def report_config_download(self, episode_id):
2138 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2140 def report_index_download(self, episode_id):
2141 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2143 def report_player_url(self, episode_id):
2144 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2146 def _real_extract(self, url):
2147 mobj = re.match(self._VALID_URL, url)
2149 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# ':tds' / ':colbert' style shortcuts map to the show's full-episodes page.
2152 if mobj.group('shortname'):
2153 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2154 url = u'http://www.thedailyshow.com/full-episodes/'
2156 url = u'http://www.colbertnation.com/full-episodes/'
2157 mobj = re.match(self._VALID_URL, url)
2158 assert mobj is not None
# An empty 'episode' group means "download whatever the site redirects to".
2160 dlNewest = not mobj.group('episode')
2162 epTitle = mobj.group('showname')
2164 epTitle = mobj.group('episode')
2166 req = urllib2.Request(url)
2167 self.report_extraction(epTitle)
2169 htmlHandle = urllib2.urlopen(req)
2170 html = htmlHandle.read()
2171 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2172 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Re-match against the post-redirect URL to learn the concrete episode.
2175 url = htmlHandle.geturl()
2176 mobj = re.match(self._VALID_URL, url)
2178 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2180 if mobj.group('episode') == '':
2181 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2183 epTitle = mobj.group('episode')
# The Flash embed (or `var url = "..."`) carries the mtvnservices media URI.
2185 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2186 if len(mMovieParams) == 0:
2187 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the player URL through its redirect chain.
2190 playerUrl_raw = mMovieParams[0][0]
2191 self.report_player_url(epTitle)
2193 urlHandle = urllib2.urlopen(playerUrl_raw)
2194 playerUrl = urlHandle.geturl()
2195 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2196 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Download the MRSS index feed listing the episode's video segments.
2199 uri = mMovieParams[0][1]
2200 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2201 self.report_index_download(epTitle)
2203 indexXml = urllib2.urlopen(indexUrl).read()
2204 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2205 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2210 idoc = xml.etree.ElementTree.fromstring(indexXml)
2211 itemEls = idoc.findall('.//item')
# One mediaGen config download per segment; its <rendition> elements list
# the available bitrates/sources.
2212 for itemEl in itemEls:
2213 mediaId = itemEl.findall('./guid')[0].text
2214 shortMediaId = mediaId.split(':')[-1]
2215 showId = mediaId.split(':')[-2].replace('.com', '')
2216 officialTitle = itemEl.findall('./title')[0].text
2217 officialDate = itemEl.findall('./pubDate')[0].text
2219 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2220 urllib.urlencode({'uri': mediaId}))
2221 configReq = urllib2.Request(configUrl)
2222 self.report_config_download(epTitle)
2224 configXml = urllib2.urlopen(configReq).read()
2225 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2226 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2229 cdoc = xml.etree.ElementTree.fromstring(configXml)
# `turls` is presumably initialised on an elided line just above this loop.
2231 for rendition in cdoc.findall('.//rendition'):
2232 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2236 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2239 # For now, just pick the highest bitrate
2240 format,video_url = turls[-1]
2242 effTitle = showId + u'-' + epTitle
2247 'upload_date': officialDate,
2252 'description': officialTitle,
2253 'player_url': playerUrl
2256 results.append(info)
2261 class EscapistIE(InfoExtractor):
# Scrapes escapistmagazine.com video pages: description/thumbnail/player come
# from <meta> Open Graph tags, and the real media URL from the player's
# `config=` JSON (which is actually JavaScript with single quotes).
# NOTE(review): embedded numbering gaps indicate elided guard/`return` lines.
2262 """Information extractor for The Escapist """
2264 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2265 IE_NAME = u'escapist'
2267 def report_extraction(self, showName):
2268 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2270 def report_config_download(self, showName):
2271 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2273 def _real_extract(self, url):
2274 mobj = re.match(self._VALID_URL, url)
2276 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2278 showName = mobj.group('showname')
2279 videoId = mobj.group('episode')
2281 self.report_extraction(showName)
2283 webPageBytes = urllib2.urlopen(url).read()
2284 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2285 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Page metadata lives in standard <meta>/OpenGraph tags.
2288 webPage = webPageBytes.decode('utf-8')
2289 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2290 description = unescapeHTML(descMatch.group(1))
2291 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2292 imgUrl = unescapeHTML(imgMatch.group(1))
2293 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2294 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL embeds a percent-encoded `config=` parameter pointing at
# the playlist configuration.
2295 configUrlMatch = re.search('config=(.*)$', playerUrl)
2296 configUrl = urllib2.unquote(configUrlMatch.group(1))
2298 self.report_config_download(showName)
2300 configJSON = urllib2.urlopen(configUrl).read()
2301 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2302 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2305 # Technically, it's JavaScript, not JSON
# NOTE(review): this blanket quote swap corrupts the data if any string
# value contains an apostrophe or a double quote — known fragility.
2306 configJSON = configJSON.replace("'", '"')
2309 config = json.loads(configJSON)
2310 except (ValueError,), err:
2311 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The second playlist entry holds the actual video URL (entry 0 is
# presumably an ad or intro — confirm against a live config file).
2314 playlist = config['playlist']
2315 videoUrl = playlist[1]['url']
# Result dict (header and remaining fields are on elided lines).
2320 'uploader': showName,
2321 'upload_date': None,
2325 'thumbnail': imgUrl,
2326 'description': description,
2327 'player_url': playerUrl,
2333 class CollegeHumorIE(InfoExtractor):
# Two-step extraction: scrape the page for the internal numeric video id
# (`id="video:NNN"`), then fetch the moogaloop metadata XML that carries the
# title, description, file URL and thumbnail.
# NOTE(review): embedded numbering gaps indicate elided guard/`try:`/`return`
# lines and the `info` dict header.
2334 """Information extractor for collegehumor.com"""
2336 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2337 IE_NAME = u'collegehumor'
2339 def report_webpage(self, video_id):
2340 """Report information extraction."""
2341 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2343 def report_extraction(self, video_id):
2344 """Report information extraction."""
2345 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2347 def _real_extract(self, url):
2348 mobj = re.match(self._VALID_URL, url)
2350 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2352 video_id = mobj.group('videoid')
2354 self.report_webpage(video_id)
2355 request = urllib2.Request(url)
2357 webpage = urllib2.urlopen(request).read()
2358 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2359 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds its internal id as id="video:<number>".
2362 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2364 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2366 internal_video_id = m.group('internalvideoid')
2370 'internal_id': internal_video_id,
# Fetch the moogaloop player's metadata XML for this internal id.
2373 self.report_extraction(video_id)
2374 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2376 metaXml = urllib2.urlopen(xmlUrl).read()
2377 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2378 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2381 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Missing XML elements raise IndexError here, caught on an elided line
# (see the trouble() call below).
2383 videoNode = mdoc.findall('./video')[0]
2384 info['description'] = videoNode.findall('./description')[0].text
2385 info['title'] = videoNode.findall('./caption')[0].text
2386 info['url'] = videoNode.findall('./file')[0].text
2387 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension/format are derived from the media URL's suffix.
2388 info['ext'] = info['url'].rpartition('.')[2]
2389 info['format'] = info['ext']
2391 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2397 class XVideosIE(InfoExtractor):
# Scrapes xvideos.com watch pages: the FLV URL comes percent-encoded from the
# `flv_url=` player parameter, the title from <title>, the thumbnail from a
# CDN URL pattern in the page.
# NOTE(review): embedded numbering gaps indicate elided guard/`try:`/`return`
# lines and the result dict header.
2398 """Information extractor for xvideos.com"""
2400 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2401 IE_NAME = u'xvideos'
2403 def report_webpage(self, video_id):
2404 """Report information extraction."""
2405 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2407 def report_extraction(self, video_id):
2408 """Report information extraction."""
2409 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2411 def _real_extract(self, url):
2412 mobj = re.match(self._VALID_URL, url)
2414 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2416 video_id = mobj.group(1).decode('utf-8')
2418 self.report_webpage(video_id)
# Canonicalise the page URL from the captured numeric id.
2420 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2422 webpage = urllib2.urlopen(request).read()
2423 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2424 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2427 self.report_extraction(video_id)
# The player's flv_url parameter is percent-encoded; unquote it.
2431 mobj = re.search(r'flv_url=(.+?)&', webpage)
2433 self._downloader.trouble(u'ERROR: unable to extract video url')
2435 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> tag minus the trailing " - XVID..." suffix.
2439 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2441 self._downloader.trouble(u'ERROR: unable to extract video title')
2443 video_title = mobj.group(1).decode('utf-8')
2446 # Extract video thumbnail
2447 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2449 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2451 video_thumbnail = mobj.group(1).decode('utf-8')
# Result dict fields (header and remaining fields are on elided lines).
2457 'upload_date': None,
2458 'title': video_title,
2461 'thumbnail': video_thumbnail,
2462 'description': None,
2469 class SoundcloudIE(InfoExtractor):
# Scrapes soundcloud.com track pages for the media uid and stream token, then
# composes the media.soundcloud.com stream URL from them.
# NOTE(review): embedded numbering gaps indicate elided lines (guards,
# `return` statements, `upload_date` initialisation, and — presumably — the
# urlopen of the crossdomain.xml request built near the end).
2470 """Information extractor for soundcloud.com
2471 To access the media, the uid of the song and a stream token
2472 must be extracted from the page source and the script must make
2473 a request to media.soundcloud.com/crossdomain.xml. Then
2474 the media can be grabbed by requesting from an url composed
2475 of the stream token and uid
2478 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2479 IE_NAME = u'soundcloud'
2481 def __init__(self, downloader=None):
2482 InfoExtractor.__init__(self, downloader)
2484 def report_webpage(self, video_id):
2485 """Report information extraction."""
2486 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2488 def report_extraction(self, video_id):
2489 """Report information extraction."""
2490 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2492 def _real_extract(self, url):
2493 mobj = re.match(self._VALID_URL, url)
2495 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2498 # extract uploader (which is in the url)
2499 uploader = mobj.group(1).decode('utf-8')
2500 # extract simple title (uploader + slug of song title)
2501 slug_title = mobj.group(2).decode('utf-8')
2502 simple_title = uploader + u'-' + slug_title
2504 self.report_webpage('%s/%s' % (uploader, slug_title))
2506 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2508 webpage = urllib2.urlopen(request).read()
2509 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2510 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2513 self.report_extraction('%s/%s' % (uploader, slug_title))
2515 # extract uid and stream token that soundcloud hands out for access
2516 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2518 video_id = mobj.group(1)
2519 stream_token = mobj.group(2)
2521 # extract unsimplified title
2522 mobj = re.search('"title":"(.*?)",', webpage)
2524 title = mobj.group(1).decode('utf-8')
2526 title = simple_title
2528 # construct media url (with uid/token)
2529 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2530 mediaURL = mediaURL % (video_id, stream_token)
# Description is optional; default used when the markup is absent.
2533 description = u'No description available'
2534 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2536 description = mobj.group(1)
# Parse the human-readable date (e.g. 'November 8, 2010 14:30') into
# YYYYMMDD; parse failures are only logged, not fatal.
2540 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2543 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2544 except Exception, e:
2545 self._downloader.to_stderr(str(e))
2547 # for soundcloud, a request to a cross domain is required for cookies
2548 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
# Result dict fields (header and remaining fields are on elided lines).
2551 'id': video_id.decode('utf-8'),
2553 'uploader': uploader.decode('utf-8'),
2554 'upload_date': upload_date,
2559 'description': description.decode('utf-8')
2563 class InfoQIE(InfoExtractor):
# Scrapes infoq.com presentation pages: the media path is base64-encoded in a
# `jsclassref` attribute and is appended to an rtmpe:// base URL; title and
# description come from inline JS and a <meta> tag.
# NOTE(review): embedded numbering gaps indicate elided guard/`try:`/`return`
# lines, the IE_NAME assignment, and the result dict header.
2564 """Information extractor for infoq.com"""
2566 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2569 def report_webpage(self, video_id):
2570 """Report information extraction."""
2571 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2573 def report_extraction(self, video_id):
2574 """Report information extraction."""
2575 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2577 def _real_extract(self, url):
2578 mobj = re.match(self._VALID_URL, url)
2580 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2583 self.report_webpage(url)
2585 request = urllib2.Request(url)
2587 webpage = urllib2.urlopen(request).read()
2588 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2589 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2592 self.report_extraction(url)
# jsclassref holds the base64-encoded, percent-encoded media path.
2596 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2598 self._downloader.trouble(u'ERROR: unable to extract video url')
2600 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
# Title is assigned to a JS variable in the page source.
2604 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2606 self._downloader.trouble(u'ERROR: unable to extract video title')
2608 video_title = mobj.group(1).decode('utf-8')
2610 # Extract description
2611 video_description = u'No description available.'
2612 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2613 if mobj is not None:
2614 video_description = mobj.group(1).decode('utf-8')
# The media path's basename doubles as id and extension, e.g. 'foo.mp4'.
# NOTE(review): .split('.') breaks if the filename contains extra dots —
# rsplit/os.path.splitext would be safer; confirm before changing.
2616 video_filename = video_url.split('/')[-1]
2617 video_id, extension = video_filename.split('.')
# Result dict fields (header and remaining fields are on elided lines).
2623 'upload_date': None,
2624 'title': video_title,
2626 'format': extension, # Extension is always(?) mp4, but seems to be flv
2628 'description': video_description,
2634 class MixcloudIE(InfoExtractor):
# Uses mixcloud's JSON API (api/1/cloudcast/...) to list 'audio_formats';
# each format maps either to a plain url list or to {bitrate: url list}.
# Tries URLs until one answers, honouring the downloader's 'format' and
# 'listformats' params.
# NOTE(review): embedded numbering gaps indicate elided lines (guards,
# `try:` lines, `return` statements, and the loop break/else handling in
# _real_extract).
2635 """Information extractor for www.mixcloud.com"""
2636 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2637 IE_NAME = u'mixcloud'
2639 def __init__(self, downloader=None):
2640 InfoExtractor.__init__(self, downloader)
2642 def report_download_json(self, file_id):
2643 """Report JSON download."""
2644 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2646 def report_extraction(self, file_id):
2647 """Report information extraction."""
2648 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2650 def get_urls(self, jsonData, fmt, bitrate='best'):
2651 """Get urls from 'audio_formats' section in json"""
2654 bitrate_list = jsonData[fmt]
2655 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
# NOTE(review): max() over the bitrate keys — if the keys are strings
# this picks the lexicographic, not numeric, maximum; confirm the API's
# key type before relying on "highest".
2656 bitrate = max(bitrate_list) # select highest
2658 url_list = jsonData[fmt][bitrate]
# Some formats carry a flat url list instead of a bitrate dict; indexing
# by bitrate then raises TypeError.
2659 except TypeError: # we have no bitrate info.
2660 url_list = jsonData[fmt]
2663 def check_urls(self, url_list):
2664 """Returns 1st active url from list"""
2665 for url in url_list:
2667 urllib2.urlopen(url)
# Dead URL: fall through to the next candidate (the success-`return`
# and failure paths are on elided lines).
2669 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2674 def _print_formats(self, formats):
2675 print 'Available formats:'
2676 for fmt in formats.keys():
2677 for b in formats[fmt]:
2679 ext = formats[fmt][b][0]
2680 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
# Flat url list (no bitrate dict): show '??' for the bitrate column.
2681 except TypeError: # we have no bitrate info
2682 ext = formats[fmt][0]
2683 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2686 def _real_extract(self, url):
2687 mobj = re.match(self._VALID_URL, url)
2689 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2691 # extract uploader & filename from url
2692 uploader = mobj.group(1).decode('utf-8')
2693 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2695 # construct API request
2696 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2697 # retrieve .json file with links to files
2698 request = urllib2.Request(file_url)
2700 self.report_download_json(file_url)
2701 jsonData = urllib2.urlopen(request).read()
2702 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2703 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2707 json_data = json.loads(jsonData)
2708 player_url = json_data['player_swf_url']
2709 formats = dict(json_data['audio_formats'])
2711 req_format = self._downloader.params.get('format', None)
# --list-formats: print the table and stop.
2714 if self._downloader.params.get('listformats', None):
2715 self._print_formats(formats)
# Default/best: probe each format until a live URL is found.
2718 if req_format is None or req_format == 'best':
2719 for format_param in formats.keys():
2720 url_list = self.get_urls(formats, format_param)
2722 file_url = self.check_urls(url_list)
2723 if file_url is not None:
# Specific format requested: validate it exists, then probe its URLs.
2726 if req_format not in formats.keys():
2727 self._downloader.trouble(u'ERROR: format is not available')
2730 url_list = self.get_urls(formats, req_format)
2731 file_url = self.check_urls(url_list)
2732 format_param = req_format
2735 'id': file_id.decode('utf-8'),
2736 'url': file_url.decode('utf-8'),
2737 'uploader': uploader.decode('utf-8'),
2738 'upload_date': u'NA',
2739 'title': json_data['name'],
2740 'ext': file_url.split('.')[-1].decode('utf-8'),
2741 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2742 'thumbnail': json_data['thumbnail_url'],
2743 'description': json_data['description'],
2744 'player_url': player_url.decode('utf-8'),
2747 class StanfordOpenClassroomIE(InfoExtractor):
# Three-level extractor: a VideoPage URL yields one video (metadata XML next
# to the course's video folder); a CoursePage yields references to all its
# VideoPages; the bare root yields references to all CoursePages. References
# are expanded recursively via self.extract().
# NOTE(review): embedded numbering gaps indicate elided lines (guards,
# `try:`/`return` statements, `info` dict headers, the list-append loops'
# surrounding statements).
2748 """Information extractor for Stanford's Open ClassRoom"""
2750 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2751 IE_NAME = u'stanfordoc'
2753 def report_download_webpage(self, objid):
2754 """Report information extraction."""
2755 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2757 def report_extraction(self, video_id):
2758 """Report information extraction."""
2759 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2761 def _real_extract(self, url):
2762 mobj = re.match(self._VALID_URL, url)
2764 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# --- Case 1: a concrete video within a course -------------------------
2767 if mobj.group('course') and mobj.group('video'): # A specific video
2768 course = mobj.group('course')
2769 video = mobj.group('video')
2771 'id': course + '_' + video,
2774 self.report_extraction(info['id'])
# Per-video metadata XML sits alongside the course's videos folder.
2775 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2776 xmlUrl = baseUrl + video + '.xml'
2778 metaXml = urllib2.urlopen(xmlUrl).read()
2779 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2780 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2782 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2784 info['title'] = mdoc.findall('./title')[0].text
2785 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2787 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2789 info['ext'] = info['url'].rpartition('.')[2]
2790 info['format'] = info['ext']
# --- Case 2: a course page — collect references to its videos ---------
2792 elif mobj.group('course'): # A course page
2793 course = mobj.group('course')
2799 self.report_download_webpage(info['id'])
2801 coursepage = urllib2.urlopen(url).read()
2802 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2803 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
# Course title from <h1>, falling back to the id.
2806 m = re.search('<h1>([^<]+)</h1>', coursepage)
2808 info['title'] = unescapeHTML(m.group(1))
2810 info['title'] = info['id']
2812 m = re.search('<description>([^<]+)</description>', coursepage)
2814 info['description'] = unescapeHTML(m.group(1))
# Deduplicated VideoPage links become 'reference' entries.
2816 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2819 'type': 'reference',
2820 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursively extract each referenced video page.
2824 for entry in info['list']:
2825 assert entry['type'] == 'reference'
2826 results += self.extract(entry['url'])
# --- Case 3: the root page — collect references to all courses --------
2831 'id': 'Stanford OpenClassroom',
2835 self.report_download_webpage(info['id'])
2836 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2838 rootpage = urllib2.urlopen(rootURL).read()
2839 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2840 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2843 info['title'] = info['id']
2845 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2848 'type': 'reference',
2849 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
# Recursively extract each referenced course page.
2854 for entry in info['list']:
2855 assert entry['type'] == 'reference'
2856 results += self.extract(entry['url'])
2859 class MTVIE(InfoExtractor):
2860 """Information extractor for MTV.com"""
2862 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2865 def report_webpage(self, video_id):
2866 """Report information extraction."""
2867 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2869 def report_extraction(self, video_id):
2870 """Report information extraction."""
2871 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2873 def _real_extract(self, url):
2874 mobj = re.match(self._VALID_URL, url)
2876 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2878 if not mobj.group('proto'):
2879 url = 'http://' + url
2880 video_id = mobj.group('videoid')
2881 self.report_webpage(video_id)
2883 request = urllib2.Request(url)
2885 webpage = urllib2.urlopen(request).read()
2886 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2887 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2890 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2892 self._downloader.trouble(u'ERROR: unable to extract song name')
2894 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2895 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2897 self._downloader.trouble(u'ERROR: unable to extract performer')
2899 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2900 video_title = performer + ' - ' + song_name
2902 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2904 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2906 mtvn_uri = mobj.group(1)
2908 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2910 self._downloader.trouble(u'ERROR: unable to extract content id')
2912 content_id = mobj.group(1)
2914 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2915 self.report_extraction(video_id)
2916 request = urllib2.Request(videogen_url)
2918 metadataXml = urllib2.urlopen(request).read()
2919 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2920 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2923 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2924 renditions = mdoc.findall('.//rendition')
2926 # For now, always pick the highest quality.
2927 rendition = renditions[-1]
2930 _,_,ext = rendition.attrib['type'].partition('/')
2931 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2932 video_url = rendition.find('./src').text
2934 self._downloader.trouble('Invalid rendition field.')
2940 'uploader': performer,
2941 'title': video_title,