2 # -*- coding: utf-8 -*-
17 import cStringIO as StringIO
21 # parse_qs was moved from the cgi module to the urlparse module in Python 2.6; fall back to the cgi import on older versions.
23 from urlparse import parse_qs
25 from cgi import parse_qs
33 import xml.etree.ElementTree
34 except ImportError: # Python<2.5: Not officially supported, but let it slip
35 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
# NOTE(review): this is a line-numbered, whitespace-stripped listing; gaps in
# the embedded numbering show elided original lines.  Code is left
# byte-identical below; only comments are added.
40 class InfoExtractor(object):
41 """Information Extractor class.
43 Information extractors are the classes that, given a URL, extract
44 information from the video (or videos) the URL refers to. This
45 information includes the real video URL, the video title and simplified
46 title, author and others. The information is stored in a dictionary
47 which is then passed to the FileDownloader. The FileDownloader
48 processes this information possibly downloading the video to the file
49 system, among other possible outcomes. The dictionaries must include
# NOTE(review): docstring lines listing the mandatory fields (orig. 50-53,
# e.g. id/url/title) are elided from this listing.
54 uploader: Nickname of the video uploader.
56 stitle: Simplified title.
57 ext: Video filename extension.
59 player_url: SWF Player URL (may be None).
61 The following fields are optional. Their primary purpose is to allow
62 youtube-dl to serve as the backend for a video search function, such
63 as the one in youtube2mp3. They are only used when their respective
64 forced printing functions are called:
66 thumbnail: Full URL to a video thumbnail image.
67 description: One-line video description.
69 Subclasses of this one should re-define the _real_initialize() and
70 _real_extract() methods and define a _VALID_URL regexp.
71 Probably, they should also be added to the list of extractors.
77 def __init__(self, downloader=None):
78 """Constructor. Receives an optional downloader."""
80 self.set_downloader(downloader)
82 def suitable(self, url):
83 """Receives a URL and returns True if suitable for this IE."""
84 return re.match(self._VALID_URL, url) is not None
# NOTE(review): the 'def initialize(self):' header (orig. line 86) is elided
# here; the docstring and call below belong to that method.
87 """Initializes an instance (authentication, etc)."""
89 self._real_initialize()
92 def extract(self, url):
93 """Extracts URL information and returns it in list of dicts."""
95 return self._real_extract(url)
97 def set_downloader(self, downloader):
98 """Sets the downloader for this IE."""
99 self._downloader = downloader
# Template methods: subclasses override these two; the base versions are
# intentionally no-ops (bodies elided in this listing).
101 def _real_initialize(self):
102 """Real initialization process. Redefine in subclasses."""
105 def _real_extract(self, url):
106 """Real extraction process. Redefine in subclasses."""
# NOTE(review): line-numbered, whitespace-stripped listing with many original
# lines elided (see numbering gaps, e.g. 122->128, 241->247).  Code is kept
# byte-identical; only comments/docstrings are touched.
110 class YoutubeIE(InfoExtractor):
111 """Information extractor for youtube.com."""
113 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
114 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
115 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
116 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
117 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
118 _NETRC_MACHINE = 'youtube'
119 # Listed in order of quality
120 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
121 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
122 _video_extensions = {
# NOTE(review): most itag->extension entries (orig. lines 123-127, 129-132)
# are elided from this listing.
128 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
133 _video_dimensions = {
# NOTE(review): the _video_dimensions entries (orig. lines 134-149) are elided.
150 def report_lang(self):
151 """Report attempt to set language."""
152 self._downloader.to_screen(u'[youtube] Setting language')
154 def report_login(self):
155 """Report attempt to log in."""
156 self._downloader.to_screen(u'[youtube] Logging in')
158 def report_age_confirmation(self):
159 """Report attempt to confirm age."""
160 self._downloader.to_screen(u'[youtube] Confirming age')
162 def report_video_webpage_download(self, video_id):
163 """Report attempt to download video webpage."""
164 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
166 def report_video_info_webpage_download(self, video_id):
167 """Report attempt to download video info webpage."""
168 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
170 def report_video_subtitles_download(self, video_id):
171 """Report attempt to download video subtitles."""
172 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
174 def report_information_extraction(self, video_id):
175 """Report attempt to extract video information."""
176 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
178 def report_unavailable_format(self, video_id, format):
179 """Report extracted video URL."""
180 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
182 def report_rtmp_download(self):
183 """Indicate the download will use the RTMP protocol."""
184 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Converts YouTube's closed-caption XML into SubRip (.srt) text.
# NOTE(review): the 'srt = ...' initialisation and the final return (orig.
# lines 187, 192, 198, 201) are elided from this listing.
186 def _closed_captions_xml_to_srt(self, xml_string):
188 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
189 # TODO parse xml instead of regex
190 for n, (start, dur_tag, dur, caption) in enumerate(texts):
191 if not dur: dur = '4'
193 end = start + float(dur)
# Render HH:MM:SS,mmm timestamps as required by the SRT format.
194 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
195 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
196 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
197 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional
199 srt += start + ' --> ' + end + '\n'
200 srt += caption + '\n\n'
203 def _print_formats(self, formats):
204 print 'Available formats:'
206 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# Sets language, logs in (username/password or .netrc), and confirms age.
208 def _real_initialize(self):
209 if self._downloader is None:
214 downloader_params = self._downloader.params
216 # Attempt to use provided username and password or .netrc data
217 if downloader_params.get('username', None) is not None:
218 username = downloader_params['username']
219 password = downloader_params['password']
220 elif downloader_params.get('usenetrc', False):
# NOTE(review): the surrounding try/except and credential unpacking (orig.
# lines 221, 223-226) are elided here.
222 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
227 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
228 except (IOError, netrc.NetrcParseError), err:
229 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language: best-effort, a failure only warns and continues.
233 request = urllib2.Request(self._LANG_URL)
236 urllib2.urlopen(request).read()
237 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
238 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
241 # No authentication to be performed
# NOTE(review): the login_form dict opening and early-return (orig. lines
# 242-246, 248) are elided here.
247 'current_form': 'loginForm',
249 'action_login': 'Log In',
250 'username': username,
251 'password': password,
253 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
256 login_results = urllib2.urlopen(request).read()
# A login form still present in the response means authentication failed.
257 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
258 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
260 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
261 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age: unlike language/login above, a failure here is fatal (trouble).
267 'action_confirm': 'Confirm',
269 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
271 self.report_age_confirmation()
272 age_results = urllib2.urlopen(request).read()
273 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
274 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
277 def _real_extract(self, url):
278 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
279 mobj = re.search(self._NEXT_URL_RE, url)
281 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
283 # Extract video id from URL
284 mobj = re.match(self._VALID_URL, url)
286 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
288 video_id = mobj.group(2)
291 self.report_video_webpage_download(video_id)
292 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
294 video_webpage = urllib2.urlopen(request).read()
295 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
296 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
299 # Attempt to extract SWF player URL
300 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JSON-style backslash escapes in the SWF URL.
302 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' values: some videos only answer get_video_info for a
# particular embed context; stop at the first response carrying a token.
307 self.report_video_info_webpage_download(video_id)
308 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
309 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
310 % (video_id, el_type))
311 request = urllib2.Request(video_info_url)
313 video_info_webpage = urllib2.urlopen(request).read()
314 video_info = parse_qs(video_info_webpage)
315 if 'token' in video_info:
317 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
318 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
320 if 'token' not in video_info:
321 if 'reason' in video_info:
322 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
324 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
327 # Start extracting information
328 self.report_information_extraction(video_id)
331 if 'author' not in video_info:
332 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
334 video_uploader = urllib.unquote_plus(video_info['author'][0])
337 if 'title' not in video_info:
338 self._downloader.trouble(u'ERROR: unable to extract video title')
340 video_title = urllib.unquote_plus(video_info['title'][0])
341 video_title = video_title.decode('utf-8')
342 video_title = sanitize_title(video_title)
345 simple_title = simplify_title(video_title)
348 if 'thumbnail_url' not in video_info:
349 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
351 else: # don't panic if we can't find it
352 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page and normalised to YYYYMMDD.
356 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
358 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
359 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
360 for expression in format_expressions:
362 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
370 video_description = u'No description available.'
371 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
373 video_description = mobj.group(1).decode('utf-8')
# NOTE(review): lxml use below presumably sits behind a feature/import guard
# in the elided lines -- lxml is not a stdlib module; verify upstream.
375 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
376 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
377 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
378 # TODO use another parser
# Closed captions (optional, only when --write-subtitles was given).
381 video_subtitles = None
382 if self._downloader.params.get('writesubtitles', False):
383 self.report_video_subtitles_download(video_id)
384 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
386 srt_list = urllib2.urlopen(request).read()
387 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
388 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
390 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
# Language preference: user-requested > English > first available.
392 if self._downloader.params.get('subtitleslang', False):
393 srt_lang = self._downloader.params.get('subtitleslang')
394 elif 'en' in srt_lang_list:
397 srt_lang = srt_lang_list[0]
398 if not srt_lang in srt_lang_list:
399 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
401 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
403 srt_xml = urllib2.urlopen(request).read()
404 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
405 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
407 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
409 self._downloader.trouble(u'WARNING: video has no closed captions')
412 video_token = urllib.unquote_plus(video_info['token'][0])
414 # Decide which formats to download
415 req_format = self._downloader.params.get('format', None)
417 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
418 self.report_rtmp_download()
419 video_url_list = [(None, video_info['conn'][0])]
420 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
421 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
422 url_data = [parse_qs(uds) for uds in url_data_strs]
423 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
424 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
426 format_limit = self._downloader.params.get('format_limit', None)
427 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
428 if format_limit is not None and format_limit in available_formats:
429 format_list = available_formats[available_formats.index(format_limit):]
431 format_list = available_formats
432 existing_formats = [x for x in format_list if x in url_map]
433 if len(existing_formats) == 0:
434 self._downloader.trouble(u'ERROR: no known formats available for video')
436 if self._downloader.params.get('listformats', None):
437 self._print_formats(existing_formats)
438 # existing_formats is ordered best-first, so [0] is best, [-1] is worst.
439 if req_format is None or req_format == 'best':
440 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
441 elif req_format == 'worst':
442 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
443 elif req_format in ('-1', 'all'):
444 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
446 # Specific formats. We pick the first in a slash-delimeted sequence.
447 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
448 req_formats = req_format.split('/')
449 video_url_list = None
450 for rf in req_formats:
452 video_url_list = [(rf, url_map[rf])]
454 if video_url_list is None:
455 self._downloader.trouble(u'ERROR: requested format not available')
458 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Emit one info dict per selected format (appended to 'results', elided).
462 for format_param, video_real_url in video_url_list:
464 video_extension = self._video_extensions.get(format_param, 'flv')
467 'id': video_id.decode('utf-8'),
468 'url': video_real_url.decode('utf-8'),
469 'uploader': video_uploader.decode('utf-8'),
470 'upload_date': upload_date,
471 'title': video_title,
472 'stitle': simple_title,
473 'ext': video_extension.decode('utf-8'),
474 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
475 'thumbnail': video_thumbnail.decode('utf-8'),
476 'description': video_description,
477 'player_url': player_url,
478 'subtitles': video_subtitles
# NOTE(review): line-numbered, whitespace-stripped listing with elided
# original lines (see numbering gaps).  Code left byte-identical; comments only.
483 class MetacafeIE(InfoExtractor):
484 """Information Extractor for metacafe.com."""
486 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
487 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
488 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
489 IE_NAME = u'metacafe'
491 def __init__(self, downloader=None):
492 InfoExtractor.__init__(self, downloader)
494 def report_disclaimer(self):
495 """Report disclaimer retrieval."""
496 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
498 def report_age_confirmation(self):
499 """Report attempt to confirm age."""
500 self._downloader.to_screen(u'[metacafe] Confirming age')
502 def report_download_webpage(self, video_id):
503 """Report webpage download."""
504 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
506 def report_extraction(self, video_id):
507 """Report information extraction."""
508 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetches the disclaimer page, then POSTs the family-filter form to get past
# the age gate before any extraction.
510 def _real_initialize(self):
511 # Retrieve disclaimer
512 request = urllib2.Request(self._DISCLAIMER)
514 self.report_disclaimer()
515 disclaimer = urllib2.urlopen(request).read()
516 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
517 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# NOTE(review): the disclaimer_form dict opening (orig. lines 518-522) is
# elided from this listing.
523 'submit': "Continue - I'm over 18",
525 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
527 self.report_age_confirmation()
528 disclaimer = urllib2.urlopen(request).read()
529 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
530 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
533 def _real_extract(self, url):
534 # Extract id and simplified title from URL
535 mobj = re.match(self._VALID_URL, url)
537 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
540 video_id = mobj.group(1)
542 # Check if video comes from YouTube
543 mobj2 = re.match(r'^yt-(.*)$', video_id)
544 if mobj2 is not None:
# 'yt-' prefixed ids are YouTube embeds: delegate to the YouTube extractor.
545 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
548 simple_title = mobj.group(2).decode('utf-8')
550 # Retrieve video webpage to extract further information
551 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
553 self.report_download_webpage(video_id)
554 webpage = urllib2.urlopen(request).read()
555 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
556 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
559 # Extract URL, uploader and title from webpage
560 self.report_extraction(video_id)
561 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
563 mediaURL = urllib.unquote(mobj.group(1))
564 video_extension = mediaURL[-3:]
566 # Extract gdaKey if available
567 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
571 gdaKey = mobj.group(1)
572 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: no direct mediaURL, so parse the flashvars blob instead.
574 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
576 self._downloader.trouble(u'ERROR: unable to extract media URL')
578 vardict = parse_qs(mobj.group(1))
579 if 'mediaData' not in vardict:
580 self._downloader.trouble(u'ERROR: unable to extract media URL')
582 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
584 self._downloader.trouble(u'ERROR: unable to extract media URL')
586 mediaURL = mobj.group(1).replace('\\/', '/')
587 video_extension = mediaURL[-3:]
588 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
590 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
592 self._downloader.trouble(u'ERROR: unable to extract title')
594 video_title = mobj.group(1).decode('utf-8')
595 video_title = sanitize_title(video_title)
597 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
599 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
601 video_uploader = mobj.group(1)
604 'id': video_id.decode('utf-8'),
605 'url': video_url.decode('utf-8'),
606 'uploader': video_uploader.decode('utf-8'),
607 'upload_date': u'NA',
608 'title': video_title,
609 'stitle': simple_title,
610 'ext': video_extension.decode('utf-8'),
# NOTE(review): line-numbered, whitespace-stripped listing with elided
# original lines.  Code left byte-identical; comments only.
616 class DailymotionIE(InfoExtractor):
617 """Information Extractor for Dailymotion"""
619 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
620 IE_NAME = u'dailymotion'
622 def __init__(self, downloader=None):
623 InfoExtractor.__init__(self, downloader)
625 def report_download_webpage(self, video_id):
626 """Report webpage download."""
627 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
629 def report_extraction(self, video_id):
630 """Report information extraction."""
631 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
633 def _real_extract(self, url):
634 # Extract id and simplified title from URL
635 mobj = re.match(self._VALID_URL, url)
637 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
640 video_id = mobj.group(1)
642 video_extension = 'flv'
644 # Retrieve video webpage to extract further information
645 request = urllib2.Request(url)
# The family_filter cookie disables Dailymotion's age gate for this request.
646 request.add_header('Cookie', 'family_filter=off')
648 self.report_download_webpage(video_id)
649 webpage = urllib2.urlopen(request).read()
650 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
651 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
654 # Extract URL, uploader and title from webpage
655 self.report_extraction(video_id)
656 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
658 self._downloader.trouble(u'ERROR: unable to extract media URL')
660 sequence = urllib.unquote(mobj.group(1))
661 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
663 self._downloader.trouble(u'ERROR: unable to extract media URL')
665 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
667 # if needed add http://www.dailymotion.com/ if relative URL
# NOTE(review): the line assigning video_url from mediaURL (orig. lines
# 668-669) is elided from this listing.
671 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
673 self._downloader.trouble(u'ERROR: unable to extract title')
675 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
676 video_title = sanitize_title(video_title)
677 simple_title = simplify_title(video_title)
679 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
681 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
683 video_uploader = mobj.group(1)
686 'id': video_id.decode('utf-8'),
687 'url': video_url.decode('utf-8'),
688 'uploader': video_uploader.decode('utf-8'),
689 'upload_date': u'NA',
690 'title': video_title,
691 'stitle': simple_title,
692 'ext': video_extension.decode('utf-8'),
# NOTE(review): line-numbered, whitespace-stripped listing with elided
# original lines.  Code left byte-identical; comments only.
698 class GoogleIE(InfoExtractor):
699 """Information extractor for video.google.com."""
701 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
702 IE_NAME = u'video.google'
704 def __init__(self, downloader=None):
705 InfoExtractor.__init__(self, downloader)
707 def report_download_webpage(self, video_id):
708 """Report webpage download."""
709 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
711 def report_extraction(self, video_id):
712 """Report information extraction."""
713 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
715 def _real_extract(self, url):
716 # Extract id from URL
717 mobj = re.match(self._VALID_URL, url)
719 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
722 video_id = mobj.group(1)
724 video_extension = 'mp4'
726 # Retrieve video webpage to extract further information
727 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
729 self.report_download_webpage(video_id)
730 webpage = urllib2.urlopen(request).read()
731 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
732 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
735 # Extract URL, uploader, and title from webpage
736 self.report_extraction(video_id)
# Prefer the mp4 download_url; fall back to the escaped flv videoUrl.
737 mobj = re.search(r"download_url:'([^']+)'", webpage)
739 video_extension = 'flv'
740 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
742 self._downloader.trouble(u'ERROR: unable to extract media URL')
744 mediaURL = urllib.unquote(mobj.group(1))
# Undo the \xNN escaping used inside the page's inline script.
745 mediaURL = mediaURL.replace('\\x3d', '\x3d')
746 mediaURL = mediaURL.replace('\\x26', '\x26')
750 mobj = re.search(r'<title>(.*)</title>', webpage)
752 self._downloader.trouble(u'ERROR: unable to extract title')
754 video_title = mobj.group(1).decode('utf-8')
755 video_title = sanitize_title(video_title)
756 simple_title = simplify_title(video_title)
758 # Extract video description
759 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
761 self._downloader.trouble(u'ERROR: unable to extract video description')
763 video_description = mobj.group(1).decode('utf-8')
764 if not video_description:
765 video_description = 'No description available.'
767 # Extract video thumbnail
768 if self._downloader.params.get('forcethumbnail', False):
769 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
771 webpage = urllib2.urlopen(request).read()
772 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
773 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
775 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
777 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
779 video_thumbnail = mobj.group(1)
780 else: # we need something to pass to process_info
784 'id': video_id.decode('utf-8'),
785 'url': video_url.decode('utf-8'),
787 'upload_date': u'NA',
788 'title': video_title,
789 'stitle': simple_title,
790 'ext': video_extension.decode('utf-8'),
# NOTE(review): line-numbered, whitespace-stripped listing with elided
# original lines.  Code left byte-identical; comments only.
796 class PhotobucketIE(InfoExtractor):
797 """Information extractor for photobucket.com."""
799 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
800 IE_NAME = u'photobucket'
802 def __init__(self, downloader=None):
803 InfoExtractor.__init__(self, downloader)
805 def report_download_webpage(self, video_id):
806 """Report webpage download."""
807 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
809 def report_extraction(self, video_id):
810 """Report information extraction."""
811 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
813 def _real_extract(self, url):
814 # Extract id from URL
815 mobj = re.match(self._VALID_URL, url)
817 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
820 video_id = mobj.group(1)
822 video_extension = 'flv'
824 # Retrieve video webpage to extract further information
825 request = urllib2.Request(url)
827 self.report_download_webpage(video_id)
828 webpage = urllib2.urlopen(request).read()
829 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
830 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
833 # Extract URL, uploader, and title from webpage
834 self.report_extraction(video_id)
835 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
837 self._downloader.trouble(u'ERROR: unable to extract media URL')
839 mediaURL = urllib.unquote(mobj.group(1))
# NOTE(review): the assignment of video_url from mediaURL (around orig. line
# 841) is elided from this listing.
843 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
845 self._downloader.trouble(u'ERROR: unable to extract title')
847 video_title = mobj.group(1).decode('utf-8')
848 video_title = sanitize_title(video_title)
849 simple_title = simplify_title(video_title)
# Uploader comes from the second <title> capture group matched above.
851 video_uploader = mobj.group(2).decode('utf-8')
854 'id': video_id.decode('utf-8'),
855 'url': video_url.decode('utf-8'),
856 'uploader': video_uploader,
857 'upload_date': u'NA',
858 'title': video_title,
859 'stitle': simple_title,
860 'ext': video_extension.decode('utf-8'),
# NOTE(review): line-numbered, whitespace-stripped listing with elided
# original lines.  Code left byte-identical; comments only.
866 class YahooIE(InfoExtractor):
867 """Information extractor for video.yahoo.com."""
869 # _VALID_URL matches all Yahoo! Video URLs
870 # _VPAGE_URL matches only the extractable '/watch/' URLs
871 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
872 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
873 IE_NAME = u'video.yahoo'
875 def __init__(self, downloader=None):
876 InfoExtractor.__init__(self, downloader)
878 def report_download_webpage(self, video_id):
879 """Report webpage download."""
880 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
882 def report_extraction(self, video_id):
883 """Report information extraction."""
884 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# new_video=False marks the single recursive retry after rewriting a
# non-'/watch/' URL into its canonical /watch/ form.
886 def _real_extract(self, url, new_video=True):
887 # Extract ID from URL
888 mobj = re.match(self._VALID_URL, url)
890 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
893 video_id = mobj.group(2)
894 video_extension = 'flv'
896 # Rewrite valid but non-extractable URLs as
897 # extractable English language /watch/ URLs
898 if re.match(self._VPAGE_URL, url) is None:
899 request = urllib2.Request(url)
901 webpage = urllib2.urlopen(request).read()
902 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
903 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
906 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
908 self._downloader.trouble(u'ERROR: Unable to extract id field')
910 yahoo_id = mobj.group(1)
912 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
914 self._downloader.trouble(u'ERROR: Unable to extract vid field')
916 yahoo_vid = mobj.group(1)
918 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
919 return self._real_extract(url, new_video=False)
921 # Retrieve video webpage to extract further information
922 request = urllib2.Request(url)
924 self.report_download_webpage(video_id)
925 webpage = urllib2.urlopen(request).read()
926 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
927 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
930 # Extract uploader and title from webpage
931 self.report_extraction(video_id)
932 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
934 self._downloader.trouble(u'ERROR: unable to extract video title')
936 video_title = mobj.group(1).decode('utf-8')
937 simple_title = simplify_title(video_title)
939 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
941 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the people|profile alternation, not the
# anchor text in group(2) -- looks suspicious; verify against upstream.
943 video_uploader = mobj.group(1).decode('utf-8')
945 # Extract video thumbnail
946 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
948 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
950 video_thumbnail = mobj.group(1).decode('utf-8')
952 # Extract video description
953 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
955 self._downloader.trouble(u'ERROR: unable to extract video description')
957 video_description = mobj.group(1).decode('utf-8')
958 if not video_description:
959 video_description = 'No description available.'
961 # Extract video height and width
962 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
964 self._downloader.trouble(u'ERROR: unable to extract video height')
966 yv_video_height = mobj.group(1)
968 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
970 self._downloader.trouble(u'ERROR: unable to extract video width')
972 yv_video_width = mobj.group(1)
974 # Retrieve video playlist to extract media URL
975 # I'm not completely sure what all these options are, but we
976 # seem to need most of them, otherwise the server sends a 401.
977 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
978 yv_bitrate = '700' # according to Wikipedia this is hard-coded
979 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
980 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
981 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
983 self.report_download_webpage(video_id)
984 webpage = urllib2.urlopen(request).read()
985 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
986 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
989 # Extract media URL from playlist XML
990 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
992 self._downloader.trouble(u'ERROR: Unable to extract media URL')
994 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
995 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
998 'id': video_id.decode('utf-8'),
1000 'uploader': video_uploader,
1001 'upload_date': u'NA',
1002 'title': video_title,
1003 'stitle': simple_title,
1004 'ext': video_extension.decode('utf-8'),
1005 'thumbnail': video_thumbnail.decode('utf-8'),
1006 'description': video_description,
# NOTE(review): duplicate 'thumbnail' key (also set at orig. line 1005); in a
# dict literal the later entry wins -- likely leftover, verify upstream.
1007 'thumbnail': video_thumbnail,
1012 class VimeoIE(InfoExtractor):
1013 """Information extractor for vimeo.com.

 Extracts the embedded player config JSON from the watch page and
 derives the direct play_redirect URL from the request signature,
 timestamp, codec and quality found there.
 """
1015 # _VALID_URL matches Vimeo URLs
# NOTE(review): the '.' after (?:www|player) is unescaped, so it matches
# any character, not just a literal dot — probably harmless but verify.
1016 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1019 def __init__(self, downloader=None):
1020 InfoExtractor.__init__(self, downloader)
1022 def report_download_webpage(self, video_id):
1023 """Report webpage download."""
1024 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1026 def report_extraction(self, video_id):
1027 """Report information extraction."""
1028 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1030 def _real_extract(self, url, new_video=True):
1031 # Extract ID from URL
1032 mobj = re.match(self._VALID_URL, url)
1034 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1037 video_id = mobj.group(1)
1039 # Retrieve video webpage to extract further information
1040 request = urllib2.Request(url, None, std_headers)
1042 self.report_download_webpage(video_id)
1043 webpage = urllib2.urlopen(request).read()
1044 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1045 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1048 # Now we begin extracting as much information as we can from what we
1049 # retrieved. First we extract the information common to all extractors,
1050 # and latter we extract those that are Vimeo specific.
1051 self.report_extraction(video_id)
1053 # Extract the config JSON
# The player config is embedded inline in the page; slice it out by its
# textual delimiters instead of parsing the surrounding JavaScript.
1054 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1056 config = json.loads(config)
1058 self._downloader.trouble(u'ERROR: unable to extract info section')
1062 video_title = config["video"]["title"]
1063 simple_title = simplify_title(video_title)
1066 video_uploader = config["video"]["owner"]["name"]
1068 # Extract video thumbnail
1069 video_thumbnail = config["video"]["thumbnail"]
1071 # Extract video description
1075 video_description = u'No description available.'
1076 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
1077 if mobj is not None:
1078 video_description = mobj.group(1)
# Alternative lxml-based extraction of the description element.
1080 html_parser = lxml.etree.HTMLParser()
1081 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
1082 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
1083 # TODO use another parser
1085 # Extract upload date
1086 video_upload_date = u'NA'
1087 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1088 if mobj is not None:
1089 video_upload_date = mobj.group(1)
1091 # Vimeo specific: extract request signature and timestamp
1092 sig = config['request']['signature']
1093 timestamp = config['request']['timestamp']
1095 # Vimeo specific: extract video codec and quality information
1096 # TODO bind to format param
# Preference order: h264/mp4 first, then vp8 and vp6 (both flv). The
# first codec present in config["video"]["files"] is used; 'hd' is
# preferred over 'sd' when listed for that codec.
1097 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1098 for codec in codecs:
1099 if codec[0] in config["video"]["files"]:
1100 video_codec = codec[0]
1101 video_extension = codec[1]
1102 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1103 else: quality = 'sd'
1106 self._downloader.trouble(u'ERROR: no known codec found')
1109 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1110 %(video_id, sig, timestamp, quality, video_codec.upper())
1115 'uploader': video_uploader,
1116 'upload_date': video_upload_date,
1117 'title': video_title,
1118 'stitle': simple_title,
1119 'ext': video_extension,
1120 'thumbnail': video_thumbnail,
1121 'description': video_description,
1126 class GenericIE(InfoExtractor):
1127 """Generic last-resort information extractor.

 First follows URL-shortener style redirects (via HEAD requests, falling
 back to GET on 405), then scrapes the page for a JW-Player-style
 flashvars 'file=' URL or a generic 'file='/'source=' parameter.
 """
1130 IE_NAME = u'generic'
1132 def __init__(self, downloader=None):
1133 InfoExtractor.__init__(self, downloader)
1135 def report_download_webpage(self, video_id):
1136 """Report webpage download."""
1137 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1138 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1140 def report_extraction(self, video_id):
1141 """Report information extraction."""
1142 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1144 def report_following_redirect(self, new_url):
1145 """Report information extraction."""
1146 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1148 def _test_redirect(self, url):
1149 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Request subclass that issues HEAD instead of GET so we can discover
# the final URL without downloading the body.
1150 class HeadRequest(urllib2.Request):
1151 def get_method(self):
1154 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1156 Subclass the HTTPRedirectHandler to make it use our
1157 HeadRequest also on the redirected URL
1159 def redirect_request(self, req, fp, code, msg, headers, newurl):
1160 if code in (301, 302, 303, 307):
1161 newurl = newurl.replace(' ', '%20')
# Strip body-related headers: the redirected HEAD request carries
# no payload.
1162 newheaders = dict((k,v) for k,v in req.headers.items()
1163 if k.lower() not in ("content-length", "content-type"))
1164 return HeadRequest(newurl,
1166 origin_req_host=req.get_origin_req_host(),
1169 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1171 class HTTPMethodFallback(urllib2.BaseHandler):
1173 Fallback to GET if HEAD is not allowed (405 HTTP error)
1175 def http_error_405(self, req, fp, code, msg, headers):
1179 newheaders = dict((k,v) for k,v in req.headers.items()
1180 if k.lower() not in ("content-length", "content-type"))
1181 return self.parent.open(urllib2.Request(req.get_full_url(),
1183 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with exactly the handlers we need; order of
# add_handler calls mirrors the list below.
1187 opener = urllib2.OpenerDirector()
1188 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1189 HTTPMethodFallback, HEADRedirectHandler,
1190 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1191 opener.add_handler(handler())
1193 response = opener.open(HeadRequest(url))
1194 new_url = response.geturl()
# No redirect happened: report False so _real_extract proceeds normally.
1196 if url == new_url: return False
# Redirect found: hand the resolved URL back to the downloader so the
# whole extractor chain restarts against it.
1198 self.report_following_redirect(new_url)
1199 self._downloader.download([new_url])
1202 def _real_extract(self, url):
1203 if self._test_redirect(url): return
1205 video_id = url.split('/')[-1]
1206 request = urllib2.Request(url)
1208 self.report_download_webpage(video_id)
1209 webpage = urllib2.urlopen(request).read()
1210 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1211 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1213 except ValueError, err:
1214 # since this is the last-resort InfoExtractor, if
1215 # this error is thrown, it'll be thrown here
1216 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1219 self.report_extraction(video_id)
1220 # Start with something easy: JW Player in SWFObject
1221 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1223 # Broaden the search a little bit
1224 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1226 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1229 # It's possible that one of the regexes
1230 # matched, but returned an empty group:
1231 if mobj.group(1) is None:
1232 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1235 video_url = urllib.unquote(mobj.group(1))
1236 video_id = os.path.basename(video_url)
1238 # here's a fun little line of code for you:
# Split the basename into (id, extension); the extension is taken
# without its leading dot.
1239 video_extension = os.path.splitext(video_id)[1][1:]
1240 video_id = os.path.splitext(video_id)[0]
1242 # it's tempting to parse this further, but you would
1243 # have to take into account all the variations like
1244 # Video Title - Site Name
1245 # Site Name | Video Title
1246 # Video Title - Tagline | Site Name
1247 # and so on and so forth; it's just not practical
1248 mobj = re.search(r'<title>(.*)</title>', webpage)
1250 self._downloader.trouble(u'ERROR: unable to extract title')
1252 video_title = mobj.group(1).decode('utf-8')
1253 video_title = sanitize_title(video_title)
1254 simple_title = simplify_title(video_title)
1256 # video uploader is domain name
1257 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error message says 'title' but this step extracts the
# uploader (domain name) — the message looks copy-pasted.
1259 self._downloader.trouble(u'ERROR: unable to extract title')
1261 video_uploader = mobj.group(1).decode('utf-8')
1264 'id': video_id.decode('utf-8'),
1265 'url': video_url.decode('utf-8'),
1266 'uploader': video_uploader,
1267 'upload_date': u'NA',
1268 'title': video_title,
1269 'stitle': simple_title,
1270 'ext': video_extension.decode('utf-8'),
1276 class YoutubeSearchIE(InfoExtractor):
1277 """Information Extractor for YouTube search queries.

 Handles 'ytsearch:', 'ytsearchN:' and 'ytsearchall:' pseudo-URLs by
 querying the GData API in pages of 50 and queueing each result as a
 regular watch URL.
 """
1278 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1279 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1280 _max_youtube_results = 1000
1281 IE_NAME = u'youtube:search'
1283 def __init__(self, downloader=None):
1284 InfoExtractor.__init__(self, downloader)
1286 def report_download_page(self, query, pagenum):
1287 """Report attempt to download playlist page with given number."""
1288 query = query.decode(preferredencoding())
1289 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1291 def _real_extract(self, query):
1292 mobj = re.match(self._VALID_URL, query)
1294 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split off the 'ytsearch<N>' prefix; an empty prefix means one result,
# 'all' means the API maximum, otherwise the prefix is parsed as the
# requested count.
1297 prefix, query = query.split(':')
1299 query = query.encode('utf-8')
1301 self._download_n_results(query, 1)
1303 elif prefix == 'all':
1304 self._download_n_results(query, self._max_youtube_results)
1310 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1312 elif n > self._max_youtube_results:
1313 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1314 n = self._max_youtube_results
1315 self._download_n_results(query, n)
1317 except ValueError: # parsing prefix as integer fails
1318 self._download_n_results(query, 1)
1321 def _download_n_results(self, query, n):
1322 """Downloads a specified number of results for a query"""
# Page through the API 50 results at a time until enough ids are
# collected; 'limit' is capped by the API's reported totalItems.
1328 while (50 * pagenum) < limit:
1329 self.report_download_page(query, pagenum+1)
1330 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1331 request = urllib2.Request(result_url)
1333 data = urllib2.urlopen(request).read()
1334 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1335 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1337 api_response = json.loads(data)['data']
1339 new_ids = list(video['id'] for video in api_response['items'])
1340 video_ids += new_ids
1342 limit = min(n, api_response['totalItems'])
1345 if len(video_ids) > n:
1346 video_ids = video_ids[:n]
1347 for id in video_ids:
1348 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1352 class GoogleSearchIE(InfoExtractor):
1353 """Information Extractor for Google Video search queries.

 Handles 'gvsearch:', 'gvsearchN:' and 'gvsearchall:' pseudo-URLs by
 scraping the HTML result pages (10 results per page).
 """
1354 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1355 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1356 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1357 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1358 _max_google_results = 1000
1359 IE_NAME = u'video.google:search'
1361 def __init__(self, downloader=None):
1362 InfoExtractor.__init__(self, downloader)
1364 def report_download_page(self, query, pagenum):
1365 """Report attempt to download playlist page with given number."""
1366 query = query.decode(preferredencoding())
1367 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1369 def _real_extract(self, query):
1370 mobj = re.match(self._VALID_URL, query)
1372 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Same prefix convention as the other search extractors: empty -> 1,
# 'all' -> maximum, numeric -> that many (capped at the maximum).
1375 prefix, query = query.split(':')
1377 query = query.encode('utf-8')
1379 self._download_n_results(query, 1)
1381 elif prefix == 'all':
1382 self._download_n_results(query, self._max_google_results)
1388 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1390 elif n > self._max_google_results:
1391 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1392 n = self._max_google_results
1393 self._download_n_results(query, n)
1395 except ValueError: # parsing prefix as integer fails
1396 self._download_n_results(query, 1)
1399 def _download_n_results(self, query, n):
1400 """Downloads a specified number of results for a query"""
1406 self.report_download_page(query, pagenum)
1407 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1408 request = urllib2.Request(result_url)
1410 page = urllib2.urlopen(request).read()
1411 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1412 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1415 # Extract video identifiers
1416 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1417 video_id = mobj.group(1)
1418 if video_id not in video_ids:
1419 video_ids.append(video_id)
# Enough ids collected: queue them all and stop paging.
1420 if len(video_ids) == n:
1421 # Specified n videos reached
1422 for id in video_ids:
1423 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next page" link: queue whatever was found and stop.
1426 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1427 for id in video_ids:
1428 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1431 pagenum = pagenum + 1
1434 class YahooSearchIE(InfoExtractor):
1435 """Information Extractor for Yahoo! Video search queries.

 Handles 'yvsearch:', 'yvsearchN:' and 'yvsearchall:' pseudo-URLs by
 scraping the HTML result pages; duplicate ids are filtered with a set.
 """
1436 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1437 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1438 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1439 _MORE_PAGES_INDICATOR = r'\s*Next'
1440 _max_yahoo_results = 1000
1441 IE_NAME = u'video.yahoo:search'
1443 def __init__(self, downloader=None):
1444 InfoExtractor.__init__(self, downloader)
1446 def report_download_page(self, query, pagenum):
1447 """Report attempt to download playlist page with given number."""
1448 query = query.decode(preferredencoding())
1449 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1451 def _real_extract(self, query):
1452 mobj = re.match(self._VALID_URL, query)
1454 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Same prefix convention as the other search extractors: empty -> 1,
# 'all' -> maximum, numeric -> that many (capped at the maximum).
1457 prefix, query = query.split(':')
1459 query = query.encode('utf-8')
1461 self._download_n_results(query, 1)
1463 elif prefix == 'all':
1464 self._download_n_results(query, self._max_yahoo_results)
1470 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1472 elif n > self._max_yahoo_results:
1473 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1474 n = self._max_yahoo_results
1475 self._download_n_results(query, n)
1477 except ValueError: # parsing prefix as integer fails
1478 self._download_n_results(query, 1)
1481 def _download_n_results(self, query, n):
1482 """Downloads a specified number of results for a query"""
# 'already_seen' deduplicates ids across pages while 'video_ids'
# preserves first-seen order.
1485 already_seen = set()
1489 self.report_download_page(query, pagenum)
1490 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1491 request = urllib2.Request(result_url)
1493 page = urllib2.urlopen(request).read()
1494 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1495 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1498 # Extract video identifiers
1499 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1500 video_id = mobj.group(1)
1501 if video_id not in already_seen:
1502 video_ids.append(video_id)
1503 already_seen.add(video_id)
1504 if len(video_ids) == n:
1505 # Specified n videos reached
1506 for id in video_ids:
1507 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "next page" link: queue whatever was found and stop.
1510 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1511 for id in video_ids:
1512 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1515 pagenum = pagenum + 1
1518 class YoutubePlaylistIE(InfoExtractor):
1519 """Information Extractor for YouTube playlists.

 Pages through a playlist's HTML pages, collects watch-URL video ids,
 applies the downloader's playliststart/playlistend window, and queues
 each video as a regular watch URL.
 """
1521 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1522 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
# Filled in with the playlist id before use, since the indicator must
# match links belonging to this specific playlist.
1523 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
1524 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1525 IE_NAME = u'youtube:playlist'
1527 def __init__(self, downloader=None):
1528 InfoExtractor.__init__(self, downloader)
1530 def report_download_page(self, playlist_id, pagenum):
1531 """Report attempt to download playlist page with given number."""
1532 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1534 def _real_extract(self, url):
1535 # Extract playlist id
1536 mobj = re.match(self._VALID_URL, url)
1538 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 captures a single-video id embedded in the playlist URL;
# when present, download just that video.
1542 if mobj.group(3) is not None:
1543 self._downloader.download([mobj.group(3)])
1546 # Download playlist pages
1547 # prefix is 'p' as default for playlists but there are other types that need extra care
1548 playlist_prefix = mobj.group(1)
1549 if playlist_prefix == 'a':
1550 playlist_access = 'artist'
1552 playlist_prefix = 'p'
1553 playlist_access = 'view_play_list'
1554 playlist_id = mobj.group(2)
1559 self.report_download_page(playlist_id, pagenum)
1560 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1561 request = urllib2.Request(url)
1563 page = urllib2.urlopen(request).read()
1564 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1565 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1568 # Extract video identifiers
1570 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1571 if mobj.group(1) not in ids_in_page:
1572 ids_in_page.append(mobj.group(1))
1573 video_ids.extend(ids_in_page)
1575 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1577 pagenum = pagenum + 1
# Apply the user's playlist window (1-based playliststart; -1 means
# "to the end").
1579 playliststart = self._downloader.params.get('playliststart', 1) - 1
1580 playlistend = self._downloader.params.get('playlistend', -1)
1581 if playlistend == -1:
1582 video_ids = video_ids[playliststart:]
1584 video_ids = video_ids[playliststart:playlistend]
1586 for id in video_ids:
1587 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1591 class YoutubeUserIE(InfoExtractor):
1592 """Information Extractor for YouTube users.

 Pages through a user's uploads via the GData feed (_GDATA_PAGE_SIZE
 entries per request), applies the playliststart/playlistend window and
 queues each video as a regular watch URL.
 """
1594 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1595 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1596 _GDATA_PAGE_SIZE = 50
1597 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1598 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1599 IE_NAME = u'youtube:user'
1601 def __init__(self, downloader=None):
1602 InfoExtractor.__init__(self, downloader)
1604 def report_download_page(self, username, start_index):
1605 """Report attempt to download user page."""
1606 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1607 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1609 def _real_extract(self, url):
1611 mobj = re.match(self._VALID_URL, url)
1613 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1616 username = mobj.group(1)
1618 # Download video ids using YouTube Data API. Result size per
1619 # query is limited (currently to 50 videos) so we need to query
1620 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
1627 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1628 self.report_download_page(username, start_index)
1630 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1633 page = urllib2.urlopen(request).read()
1634 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1635 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1638 # Extract video identifiers
1641 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1642 if mobj.group(1) not in ids_in_page:
1643 ids_in_page.append(mobj.group(1))
1645 video_ids.extend(ids_in_page)
1647 # A little optimization - if current page is not
1648 # "full", ie. does not contain PAGE_SIZE video ids then
1649 # we can assume that this page is the last one - there
1650 # are no more ids on further pages - no need to query
1653 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the user's playlist window (1-based playliststart; -1 means
# "to the end").
1658 all_ids_count = len(video_ids)
1659 playliststart = self._downloader.params.get('playliststart', 1) - 1
1660 playlistend = self._downloader.params.get('playlistend', -1)
1662 if playlistend == -1:
1663 video_ids = video_ids[playliststart:]
1665 video_ids = video_ids[playliststart:playlistend]
1667 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1668 (username, all_ids_count, len(video_ids)))
1670 for video_id in video_ids:
1671 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1674 class DepositFilesIE(InfoExtractor):
1675 """Information extractor for depositfiles.com.

 Re-requests the file page with the 'Free download' form submitted and
 scrapes the resulting fileshare URL and title.
 """
1677 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1678 IE_NAME = u'DepositFiles'
1680 def __init__(self, downloader=None):
1681 InfoExtractor.__init__(self, downloader)
1683 def report_download_webpage(self, file_id):
1684 """Report webpage download."""
1685 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1687 def report_extraction(self, file_id):
1688 """Report information extraction."""
1689 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1691 def _real_extract(self, url):
1692 file_id = url.split('/')[-1]
1693 # Rebuild url in english locale
1694 url = 'http://depositfiles.com/en/files/' + file_id
1696 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the 'Free download'
# button, which reveals the real download form.
1697 free_download_indication = { 'gateway_result' : '1' }
1698 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1700 self.report_download_webpage(file_id)
1701 webpage = urllib2.urlopen(request).read()
1702 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1703 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1706 # Search for the real file URL
1707 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1708 if (mobj is None) or (mobj.group(1) is None):
1709 # Try to figure out reason of the error.
# Site restriction messages (rate limits etc.) are wrapped in a
# <strong>Attention...</strong> element; surface them to the user.
1710 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1711 if (mobj is not None) and (mobj.group(1) is not None):
1712 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1713 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1715 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1718 file_url = mobj.group(1)
1719 file_extension = os.path.splitext(file_url)[1][1:]
1721 # Search for file title
1722 mobj = re.search(r'<b title="(.*?)">', webpage)
1724 self._downloader.trouble(u'ERROR: unable to extract title')
1726 file_title = mobj.group(1).decode('utf-8')
1729 'id': file_id.decode('utf-8'),
1730 'url': file_url.decode('utf-8'),
1732 'upload_date': u'NA',
1733 'title': file_title,
1734 'stitle': file_title,
1735 'ext': file_extension.decode('utf-8'),
1741 class FacebookIE(InfoExtractor):
1742 """Information Extractor for Facebook.

 Logs in (credentials from options or .netrc), downloads the video
 page, and parses title/owner/thumbnail and per-format URLs out of the
 page's JavaScript. Format selection mirrors the YouTube extractor.
 """
1744 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1745 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1746 _NETRC_MACHINE = 'facebook'
# Best-to-worst format order; also used for the format_limit cut-off.
1747 _available_formats = ['video', 'highqual', 'lowqual']
1748 _video_extensions = {
1753 IE_NAME = u'facebook'
1755 def __init__(self, downloader=None):
1756 InfoExtractor.__init__(self, downloader)
1758 def _reporter(self, message):
1759 """Add header and report message."""
1760 self._downloader.to_screen(u'[facebook] %s' % message)
1762 def report_login(self):
1763 """Report attempt to log in."""
1764 self._reporter(u'Logging in')
1766 def report_video_webpage_download(self, video_id):
1767 """Report attempt to download video webpage."""
1768 self._reporter(u'%s: Downloading video webpage' % video_id)
1770 def report_information_extraction(self, video_id):
1771 """Report attempt to extract video information."""
1772 self._reporter(u'%s: Extracting video information' % video_id)
1774 def _parse_page(self, video_webpage):
1775 """Extract video information from page"""
# Map of info-dict key -> regex that captures it from the page's
# inline JavaScript.
1777 data = {'title': r'\("video_title", "(.*?)"\)',
1778 'description': r'<div class="datawrap">(.*?)</div>',
1779 'owner': r'\("video_owner_name", "(.*?)"\)',
1780 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1783 for piece in data.keys():
1784 mobj = re.search(data[piece], video_webpage)
1785 if mobj is not None:
1786 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1790 for fmt in self._available_formats:
1791 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1792 if mobj is not None:
1793 # URL is in a Javascript segment inside an escaped Unicode format within
1794 # the generally utf-8 page
1795 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1796 video_info['video_urls'] = video_urls
1800 def _real_initialize(self):
1801 if self._downloader is None:
1806 downloader_params = self._downloader.params
1808 # Attempt to use provided username and password or .netrc data
1809 if downloader_params.get('username', None) is not None:
1810 useremail = downloader_params['username']
1811 password = downloader_params['password']
1812 elif downloader_params.get('usenetrc', False):
1814 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1815 if info is not None:
1819 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1820 except (IOError, netrc.NetrcParseError), err:
1821 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1824 if useremail is None:
1833 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1836 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication failed; warn but
# continue (some videos may still be accessible anonymously).
1837 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1838 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1840 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1841 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1844 def _real_extract(self, url):
1845 mobj = re.match(self._VALID_URL, url)
1847 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1849 video_id = mobj.group('ID')
1852 self.report_video_webpage_download(video_id)
1853 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1855 page = urllib2.urlopen(request)
1856 video_webpage = page.read()
1857 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1858 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1861 # Start extracting information
1862 self.report_information_extraction(video_id)
1864 # Extract information
1865 video_info = self._parse_page(video_webpage)
1868 if 'owner' not in video_info:
1869 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1871 video_uploader = video_info['owner']
1874 if 'title' not in video_info:
1875 self._downloader.trouble(u'ERROR: unable to extract video title')
1877 video_title = video_info['title']
1878 video_title = video_title.decode('utf-8')
1879 video_title = sanitize_title(video_title)
1881 simple_title = simplify_title(video_title)
# Thumbnail is optional: warn and fall back to an empty string.
1884 if 'thumbnail' not in video_info:
1885 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1886 video_thumbnail = ''
1888 video_thumbnail = video_info['thumbnail']
# Upload date, when present, is an RFC-2822 date; normalize to YYYYMMDD.
1892 if 'upload_date' in video_info:
1893 upload_time = video_info['upload_date']
1894 timetuple = email.utils.parsedate_tz(upload_time)
1895 if timetuple is not None:
1897 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1902 video_description = video_info.get('description', 'No description available.')
1904 url_map = video_info['video_urls']
1905 if len(url_map.keys()) > 0:
1906 # Decide which formats to download
1907 req_format = self._downloader.params.get('format', None)
1908 format_limit = self._downloader.params.get('format_limit', None)
# format_limit trims the preference list so nothing better than the
# limit is considered.
1910 if format_limit is not None and format_limit in self._available_formats:
1911 format_list = self._available_formats[self._available_formats.index(format_limit):]
1913 format_list = self._available_formats
1914 existing_formats = [x for x in format_list if x in url_map]
1915 if len(existing_formats) == 0:
1916 self._downloader.trouble(u'ERROR: no known formats available for video')
1918 if req_format is None:
1919 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1920 elif req_format == 'worst':
1921 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1922 elif req_format == '-1':
1923 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1926 if req_format not in url_map:
1927 self._downloader.trouble(u'ERROR: requested format not available')
1929 video_url_list = [(req_format, url_map[req_format])] # Specific format
1932 for format_param, video_real_url in video_url_list:
1934 video_extension = self._video_extensions.get(format_param, 'mp4')
1937 'id': video_id.decode('utf-8'),
1938 'url': video_real_url.decode('utf-8'),
1939 'uploader': video_uploader.decode('utf-8'),
1940 'upload_date': upload_date,
1941 'title': video_title,
1942 'stitle': simple_title,
1943 'ext': video_extension.decode('utf-8'),
1944 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1945 'thumbnail': video_thumbnail.decode('utf-8'),
1946 'description': video_description.decode('utf-8'),
1951 class BlipTVIE(InfoExtractor):
1952 """Information extractor for blip.tv.

 Requests the URL with the JSON skin appended; if the server answers
 with a video Content-Type it is treated as a direct download,
 otherwise the JSON metadata is parsed.
 """
1954 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Captures the filename extension from a media URL.
1955 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1956 IE_NAME = u'blip.tv'
1958 def report_extraction(self, file_id):
1959 """Report information extraction."""
1960 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1962 def report_direct_download(self, title):
1963 """Report information extraction."""
1964 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
1966 def _real_extract(self, url):
1967 mobj = re.match(self._VALID_URL, url)
1969 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Append the JSON-skin query (cchar is '&' or '?' depending on whether
# the URL already has a query string).
1976 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1977 request = urllib2.Request(json_url)
1978 self.report_extraction(mobj.group(1))
1981 urlh = urllib2.urlopen(request)
1982 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1983 basename = url.split('/')[-1]
1984 title,ext = os.path.splitext(basename)
1985 title = title.decode('UTF-8')
1986 ext = ext.replace('.', '')
1987 self.report_direct_download(title)
1992 'stitle': simplify_title(title),
1996 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1997 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1999 if info is None: # Regular URL
2001 json_code = urlh.read()
2002 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2003 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2007 json_data = json.loads(json_code)
2008 if 'Post' in json_data:
2009 data = json_data['Post']
# blip.tv datestamps look like '08-15-11 02:00PM'; normalize to YYYYMMDD.
2013 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2014 video_url = data['media']['url']
2015 umobj = re.match(self._URL_EXT, video_url)
2017 raise ValueError('Can not determine filename extension')
2018 ext = umobj.group(1)
2021 'id': data['item_id'],
2023 'uploader': data['display_name'],
2024 'upload_date': upload_date,
2025 'title': data['title'],
2026 'stitle': simplify_title(data['title']),
2028 'format': data['media']['mimeType'],
2029 'thumbnail': data['thumbnailUrl'],
2030 'description': data['description'],
2031 'player_url': data['embedUrl']
# ValueError and KeyError both indicate malformed/unexpected JSON.
2033 except (ValueError,KeyError), err:
2034 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2040 class MyVideoIE(InfoExtractor):
2041 """Information Extractor for myvideo.de."""
2043 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2044 IE_NAME = u'myvideo'
2046 def __init__(self, downloader=None):
2047 InfoExtractor.__init__(self, downloader)
2049 def report_download_webpage(self, video_id):
2050 """Report webpage download."""
2051 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2053 def report_extraction(self, video_id):
2054 """Report information extraction."""
2055 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2057 def _real_extract(self,url):
2058 mobj = re.match(self._VALID_URL, url)
2060 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2063 video_id = mobj.group(1)
2066 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2068 self.report_download_webpage(video_id)
2069 webpage = urllib2.urlopen(request).read()
2070 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2071 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2074 self.report_extraction(video_id)
2075 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2078 self._downloader.trouble(u'ERROR: unable to extract media URL')
2080 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2082 mobj = re.search('<title>([^<]+)</title>', webpage)
2084 self._downloader.trouble(u'ERROR: unable to extract title')
2087 video_title = mobj.group(1)
2088 video_title = sanitize_title(video_title)
2090 simple_title = simplify_title(video_title)
2096 'upload_date': u'NA',
2097 'title': video_title,
2098 'stitle': simple_title,
2104 class ComedyCentralIE(InfoExtractor):
2105 """Information extractor for The Daily Show and Colbert Report """
# Accepts either a bare shortname (':tds', ':colbert', ...) or a full
# /full-episodes/ URL; the named groups drive the branches below.
2107 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2108 IE_NAME = u'comedycentral'
2110 def report_extraction(self, episode_id):
2111 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2113 def report_config_download(self, episode_id):
2114 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2116 def report_index_download(self, episode_id):
2117 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2119 def report_player_url(self, episode_id):
2120 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2122 def _real_extract(self, url):
2123 mobj = re.match(self._VALID_URL, url)
2125 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortnames are rewritten to the show's /full-episodes/ landing page and
# re-matched so the groups below are populated.
2128 if mobj.group('shortname'):
2129 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2130 url = u'http://www.thedailyshow.com/full-episodes/'
2132 url = u'http://www.colbertnation.com/full-episodes/'
2133 mobj = re.match(self._VALID_URL, url)
2134 assert mobj is not None
# No specific episode given -> we will follow the site redirect to the newest one.
2136 dlNewest = not mobj.group('episode')
2138 epTitle = mobj.group('showname')
2140 epTitle = mobj.group('episode')
2142 req = urllib2.Request(url)
2143 self.report_extraction(epTitle)
2145 htmlHandle = urllib2.urlopen(req)
2146 html = htmlHandle.read()
2147 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2148 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The landing page redirects to a concrete episode; re-parse the final URL.
2151 url = htmlHandle.geturl()
2152 mobj = re.match(self._VALID_URL, url)
2154 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2156 if mobj.group('episode') == '':
2157 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2159 epTitle = mobj.group('episode')
# Locate the embedded Flash player URL(s); group 2 is the mtvnservices URI.
2161 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2162 if len(mMovieParams) == 0:
2163 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the player URL through its redirects to get the canonical SWF URL.
2166 playerUrl_raw = mMovieParams[0][0]
2167 self.report_player_url(epTitle)
2169 urlHandle = urllib2.urlopen(playerUrl_raw)
2170 playerUrl = urlHandle.geturl()
2171 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2172 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the MRSS index of the episode from MTV's feed service.
2175 uri = mMovieParams[0][1]
2176 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2177 self.report_index_download(epTitle)
2179 indexXml = urllib2.urlopen(indexUrl).read()
2180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2181 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2186 idoc = xml.etree.ElementTree.fromstring(indexXml)
# Each <item> in the index is one act/segment of the episode.
2187 itemEls = idoc.findall('.//item')
2188 for itemEl in itemEls:
2189 mediaId = itemEl.findall('./guid')[0].text
2190 shortMediaId = mediaId.split(':')[-1]
2191 showId = mediaId.split(':')[-2].replace('.com', '')
2192 officialTitle = itemEl.findall('./title')[0].text
2193 officialDate = itemEl.findall('./pubDate')[0].text
# Per-segment mediagen config lists the actual stream renditions.
2195 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2196 urllib.urlencode({'uri': mediaId}))
2197 configReq = urllib2.Request(configUrl)
2198 self.report_config_download(epTitle)
2200 configXml = urllib2.urlopen(configReq).read()
2201 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2202 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2205 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, stream URL) pairs; accumulated into turls (lines elided here).
2207 for rendition in cdoc.findall('.//rendition'):
2208 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2212 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2215 # For now, just pick the highest bitrate
2216 format,video_url = turls[-1]
2218 effTitle = showId + u'-' + epTitle
2223 'upload_date': officialDate,
2225 'stitle': simplify_title(effTitle),
2229 'description': officialTitle,
2230 'player_url': playerUrl
# One info dict per segment; the accumulated list is returned to the downloader.
2233 results.append(info)
2238 class EscapistIE(InfoExtractor):
2239 """Information extractor for The Escapist """
2241 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2242 IE_NAME = u'escapist'
2244 def report_extraction(self, showName):
2245 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2247 def report_config_download(self, showName):
2248 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2250 def _real_extract(self, url):
# Used to unescape HTML entities scraped from the page's <meta> tags.
2251 htmlParser = HTMLParser.HTMLParser()
2253 mobj = re.match(self._VALID_URL, url)
2255 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2257 showName = mobj.group('showname')
2258 videoId = mobj.group('episode')
2260 self.report_extraction(showName)
2262 webPage = urllib2.urlopen(url).read()
2263 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2264 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape metadata from <meta> tags.
# NOTE(review): each .group(1) below assumes its search matched; if a tag is
# missing this raises AttributeError on None rather than reporting an error.
2267 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2268 description = htmlParser.unescape(descMatch.group(1))
2269 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2270 imgUrl = htmlParser.unescape(imgMatch.group(1))
2271 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2272 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL carries the (percent-encoded) config URL in its query string.
2273 configUrlMatch = re.search('config=(.*)$', playerUrl)
2274 configUrl = urllib2.unquote(configUrlMatch.group(1))
2276 self.report_config_download(showName)
2278 configJSON = urllib2.urlopen(configUrl).read()
2279 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2280 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2283 # Technically, it's JavaScript, not JSON
# NOTE(review): this blanket quote swap also rewrites apostrophes inside string
# values, which can corrupt the payload before json.loads -- known fragility.
2284 configJSON = configJSON.replace("'", '"')
2287 config = json.loads(configJSON)
2288 except (ValueError,), err:
2289 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# Index 1 of the playlist holds the actual video entry -- presumably index 0
# is a preroll/other asset; confirm against a live config.
2292 playlist = config['playlist']
2293 videoUrl = playlist[1]['url']
2298 'uploader': showName,
2299 'upload_date': None,
2301 'stitle': simplify_title(showName),
2304 'thumbnail': imgUrl,
2305 'description': description,
2306 'player_url': playerUrl,
2312 class CollegeHumorIE(InfoExtractor):
2313 """Information extractor for collegehumor.com"""
2315 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2316 IE_NAME = u'collegehumor'
2318 def report_webpage(self, video_id):
2319 """Report information extraction."""
2320 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2322 def report_extraction(self, video_id):
2323 """Report information extraction."""
2324 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2326 def _real_extract(self, url):
# NOTE(review): htmlParser is not used in any line visible in this excerpt.
2327 htmlParser = HTMLParser.HTMLParser()
2329 mobj = re.match(self._VALID_URL, url)
2331 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Public (URL) video id; distinct from the internal player id extracted below.
2333 video_id = mobj.group('videoid')
2335 self.report_webpage(video_id)
2336 request = urllib2.Request(url)
2338 webpage = urllib2.urlopen(request).read()
2339 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2340 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds a second, internal id used by the moogaloop player API.
2343 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2345 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2347 internal_video_id = m.group('internalvideoid')
2351 'internal_id': internal_video_id,
2354 self.report_extraction(video_id)
# The moogaloop endpoint returns the video metadata as XML.
2355 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2357 metaXml = urllib2.urlopen(xmlUrl).read()
2358 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2359 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2362 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Each findall(...)[0] raises IndexError if the element is missing; the
# handler for that lives in the elided lines ending at the message below.
2364 videoNode = mdoc.findall('./video')[0]
2365 info['description'] = videoNode.findall('./description')[0].text
2366 info['title'] = videoNode.findall('./caption')[0].text
2367 info['stitle'] = simplify_title(info['title'])
2368 info['url'] = videoNode.findall('./file')[0].text
2369 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Derive extension from the media URL; also reused as the 'format' label.
2370 info['ext'] = info['url'].rpartition('.')[2]
2371 info['format'] = info['ext']
2373 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2379 class XVideosIE(InfoExtractor):
2380 """Information extractor for xvideos.com"""
2382 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2383 IE_NAME = u'xvideos'
2385 def report_webpage(self, video_id):
2386 """Report information extraction."""
2387 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2389 def report_extraction(self, video_id):
2390 """Report information extraction."""
2391 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2393 def _real_extract(self, url):
# NOTE(review): htmlParser is not used in any line visible in this excerpt.
2394 htmlParser = HTMLParser.HTMLParser()
2396 mobj = re.match(self._VALID_URL, url)
2398 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2400 video_id = mobj.group(1).decode('utf-8')
2402 self.report_webpage(video_id)
# Re-fetch via the canonical URL form so trailing slug/query parts are dropped.
2404 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2406 webpage = urllib2.urlopen(request).read()
2407 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2408 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2411 self.report_extraction(video_id)
# The player receives the percent-encoded media URL as the flv_url parameter.
2415 mobj = re.search(r'flv_url=(.+?)&', webpage)
2417 self._downloader.trouble(u'ERROR: unable to extract video url')
2419 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title: everything in <title> before the trailing " - XVID..." suffix.
2423 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2425 self._downloader.trouble(u'ERROR: unable to extract video title')
2427 video_title = mobj.group(1).decode('utf-8')
2430 # Extract video thumbnail
# NOTE(review): the dots in 'img.*?.xvideos.com' and before 'jpg' are
# unescaped and match any character; harmless here but imprecise.
2431 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2433 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2435 video_thumbnail = mobj.group(1).decode('utf-8')
2441 'upload_date': None,
2442 'title': video_title,
2443 'stitle': simplify_title(video_title),
2446 'thumbnail': video_thumbnail,
2447 'description': None,
2454 class SoundcloudIE(InfoExtractor):
2455 """Information extractor for soundcloud.com
2456 To access the media, the uid of the song and a stream token
2457 must be extracted from the page source and the script must make
2458 a request to media.soundcloud.com/crossdomain.xml. Then
2459 the media can be grabbed by requesting from an url composed
2460 of the stream token and uid
2463 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2464 IE_NAME = u'soundcloud'
2466 def __init__(self, downloader=None):
2467 InfoExtractor.__init__(self, downloader)
2469 def report_webpage(self, video_id):
2470 """Report information extraction."""
2471 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2473 def report_extraction(self, video_id):
2474 """Report information extraction."""
2475 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2477 def _real_extract(self, url):
# NOTE(review): htmlParser is not used in any line visible in this excerpt.
2478 htmlParser = HTMLParser.HTMLParser()
2480 mobj = re.match(self._VALID_URL, url)
2482 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2485 # extract uploader (which is in the url)
2486 uploader = mobj.group(1).decode('utf-8')
2487 # extract simple title (uploader + slug of song title)
2488 slug_title = mobj.group(2).decode('utf-8')
2489 simple_title = uploader + '-' + slug_title
2491 self.report_webpage('%s/%s' % (uploader, slug_title))
2493 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2495 webpage = urllib2.urlopen(request).read()
2496 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2497 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2500 self.report_extraction('%s/%s' % (uploader, slug_title))
2502 # extract uid and stream token that soundcloud hands out for access
2503 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2505 video_id = mobj.group(1)
2506 stream_token = mobj.group(2)
2508 # extract unsimplified title
2509 mobj = re.search('"title":"(.*?)",', webpage)
# NOTE(review): 'title' extracted here is never used in the visible return
# dict below -- the 'title' key is filled with simple_title instead. Confirm
# whether the scraped title was meant to be used.
2511 title = mobj.group(1)
2513 # construct media url (with uid/token)
2514 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2515 mediaURL = mediaURL % (video_id, stream_token)
2518 description = u'No description available'
2519 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2521 description = mobj.group(1)
# Upload date: parsed from the human-readable "pretty date" on the page.
2525 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2528 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
# NOTE(review): broad 'except Exception' -- deliberate best-effort, date stays unset.
2529 except Exception, e:
2532 # for soundcloud, a request to a cross domain is required for cookies
# BUG(review): urllib2.Request's second positional argument is the POST *data*,
# not the headers dict -- passing std_headers here would issue a POST with the
# header dict as its body. Headers belong in the third argument
# (urllib2.Request(url, None, std_headers)). Fix when this path is testable.
2533 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2536 'id': video_id.decode('utf-8'),
2538 'uploader': uploader.decode('utf-8'),
2539 'upload_date': upload_date,
2540 'title': simple_title.decode('utf-8'),
2541 'stitle': simple_title.decode('utf-8'),
2545 'description': description.decode('utf-8')
2549 class InfoQIE(InfoExtractor):
2550 """Information extractor for infoq.com"""
2552 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2555 def report_webpage(self, video_id):
2556 """Report information extraction."""
2557 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2559 def report_extraction(self, video_id):
2560 """Report information extraction."""
2561 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2563 def _real_extract(self, url):
# NOTE(review): htmlParser is not used in any line visible in this excerpt.
2564 htmlParser = HTMLParser.HTMLParser()
2566 mobj = re.match(self._VALID_URL, url)
2568 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2571 self.report_webpage(url)
2573 request = urllib2.Request(url)
2575 webpage = urllib2.urlopen(request).read()
2576 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2577 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2580 self.report_extraction(url)
# The page stores the media path base64-encoded in the jsclassref attribute;
# decode it and build the RTMPE stream URL from it.
2584 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2586 self._downloader.trouble(u'ERROR: unable to extract video url')
2588 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2592 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2594 self._downloader.trouble(u'ERROR: unable to extract video title')
2596 video_title = mobj.group(1).decode('utf-8')
2598 # Extract description
2599 video_description = u'No description available.'
2600 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2601 if mobj is not None:
2602 video_description = mobj.group(1).decode('utf-8')
# Derive id and extension from the media file name at the end of the URL.
2604 video_filename = video_url.split('/')[-1]
2605 video_id, extension = video_filename.split('.')
2611 'upload_date': None,
2612 'title': video_title,
2613 'stitle': simplify_title(video_title),
2615 'format': extension, # Extension is always(?) mp4, but seems to be flv
2617 'description': video_description,
2623 class MixcloudIE(InfoExtractor):
2624 """Information extractor for www.mixcloud.com"""
2625 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2626 IE_NAME = u'mixcloud'
2628 def __init__(self, downloader=None):
2629 InfoExtractor.__init__(self, downloader)
2631 def report_download_json(self, file_id):
2632 """Report JSON download."""
2633 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2635 def report_extraction(self, file_id):
2636 """Report information extraction."""
2637 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2639 def get_urls(self, jsonData, fmt, bitrate='best'):
2640 """Get urls from 'audio_formats' section in json"""
2643 bitrate_list = jsonData[fmt]
# 'best' or an unknown bitrate falls back to the highest available one.
2644 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2645 bitrate = max(bitrate_list) # select highest
2647 url_list = jsonData[fmt][bitrate]
# Some entries carry a flat URL list with no per-bitrate nesting.
2648 except TypeError: # we have no bitrate info.
2649 url_list = jsonData[fmt]
2653 def check_urls(self, url_list):
2654 """Returns 1st active url from list"""
# Probe each candidate URL; network errors mean "try the next one".
2655 for url in url_list:
2657 urllib2.urlopen(url)
2659 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# (Python 2 'print' statements: list every format/bitrate/extension combination.)
2664 def _print_formats(self, formats):
2665 print 'Available formats:'
2666 for fmt in formats.keys():
2667 for b in formats[fmt]:
2669 ext = formats[fmt][b][0]
2670 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2671 except TypeError: # we have no bitrate info
2672 ext = formats[fmt][0]
2673 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2676 def _real_extract(self, url):
2677 mobj = re.match(self._VALID_URL, url)
2679 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2681 # extract uploader & filename from url
2682 uploader = mobj.group(1).decode('utf-8')
2683 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2685 # construct API request
2686 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2687 # retrieve .json file with links to files
2688 request = urllib2.Request(file_url)
2690 self.report_download_json(file_url)
2691 jsonData = urllib2.urlopen(request).read()
2692 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2693 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2697 json_data = json.loads(jsonData)
2698 player_url = json_data['player_swf_url']
2699 formats = dict(json_data['audio_formats'])
2701 req_format = self._downloader.params.get('format', None)
# --list-formats: print the table and stop without downloading.
2704 if self._downloader.params.get('listformats', None):
2705 self._print_formats(formats)
# Default/best: take the first format whose URL list yields a live URL.
2708 if req_format is None or req_format == 'best':
2709 for format_param in formats.keys():
2710 url_list = self.get_urls(formats, format_param)
2712 file_url = self.check_urls(url_list)
2713 if file_url is not None:
2716 if req_format not in formats.keys():
2717 self._downloader.trouble(u'ERROR: format is not available')
2720 url_list = self.get_urls(formats, req_format)
2721 file_url = self.check_urls(url_list)
2722 format_param = req_format
2725 'id': file_id.decode('utf-8'),
2726 'url': file_url.decode('utf-8'),
2727 'uploader': uploader.decode('utf-8'),
2728 'upload_date': u'NA',
2729 'title': json_data['name'],
2730 'stitle': simplify_title(json_data['name']),
2731 'ext': file_url.split('.')[-1].decode('utf-8'),
2732 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2733 'thumbnail': json_data['thumbnail_url'],
2734 'description': json_data['description'],
2735 'player_url': player_url.decode('utf-8'),
2738 class StanfordOpenClassroomIE(InfoExtractor):
2739 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: a specific video (course+video), a course page (course
# only), and the site root; each takes a different branch in _real_extract.
2741 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2742 IE_NAME = u'stanfordoc'
2744 def report_download_webpage(self, objid):
2745 """Report information extraction."""
2746 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2748 def report_extraction(self, video_id):
2749 """Report information extraction."""
2750 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2752 def _real_extract(self, url):
2753 mobj = re.match(self._VALID_URL, url)
2755 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2758 if mobj.group('course') and mobj.group('video'): # A specific video
2759 course = mobj.group('course')
2760 video = mobj.group('video')
2762 'id': simplify_title(course + '_' + video),
2765 self.report_extraction(info['id'])
# Per-video metadata lives in an XML file next to the course's videos.
2766 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2767 xmlUrl = baseUrl + video + '.xml'
2769 metaXml = urllib2.urlopen(xmlUrl).read()
2770 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2771 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2773 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2775 info['title'] = mdoc.findall('./title')[0].text
# videoFile is relative; prepend the course's videos/ base URL.
2776 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2778 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2780 info['stitle'] = simplify_title(info['title'])
2781 info['ext'] = info['url'].rpartition('.')[2]
2782 info['format'] = info['ext']
2784 elif mobj.group('course'): # A course page
2785 unescapeHTML = HTMLParser.HTMLParser().unescape
2787 course = mobj.group('course')
2789 'id': simplify_title(course),
2793 self.report_download_webpage(info['id'])
2795 coursepage = urllib2.urlopen(url).read()
2796 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2797 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
# Course title: the page's <h1>, falling back to the id when absent.
2800 m = re.search('<h1>([^<]+)</h1>', coursepage)
2802 info['title'] = unescapeHTML(m.group(1))
2804 info['title'] = info['id']
2805 info['stitle'] = simplify_title(info['title'])
2807 m = re.search('<description>([^<]+)</description>', coursepage)
2809 info['description'] = unescapeHTML(m.group(1))
# Collect the course's video page links (deduplicated, order preserved) and
# recurse into each via self.extract().
2811 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2814 'type': 'reference',
2815 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2819 for entry in info['list']:
2820 assert entry['type'] == 'reference'
2821 results += self.extract(entry['url'])
# Root page: enumerate all course pages and recurse into each of them.
2825 unescapeHTML = HTMLParser.HTMLParser().unescape
2828 'id': 'Stanford OpenClassroom',
2832 self.report_download_webpage(info['id'])
2833 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2835 rootpage = urllib2.urlopen(rootURL).read()
2836 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2837 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2840 info['title'] = info['id']
2841 info['stitle'] = simplify_title(info['title'])
2843 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2846 'type': 'reference',
2847 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2852 for entry in info['list']:
2853 assert entry['type'] == 'reference'
2854 results += self.extract(entry['url'])
2857 class MTVIE(InfoExtractor):
2858 """Information extractor for MTV.com"""
2860 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2863 def report_webpage(self, video_id):
2864 """Report information extraction."""
2865 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2867 def report_extraction(self, video_id):
2868 """Report information extraction."""
2869 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2871 def _real_extract(self, url):
2872 mobj = re.match(self._VALID_URL, url)
2874 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Scheme-less URLs are accepted by the pattern; normalize before fetching.
2876 if not mobj.group('proto'):
2877 url = 'http://' + url
2878 video_id = mobj.group('videoid')
2879 self.report_webpage(video_id)
2881 request = urllib2.Request(url)
2883 webpage = urllib2.urlopen(request).read()
2884 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2885 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page is ISO-8859-1; song/artist come from MTV's own meta tags.
2888 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2890 self._downloader.trouble(u'ERROR: unable to extract song name')
2892 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2893 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2895 self._downloader.trouble(u'ERROR: unable to extract performer')
2897 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2898 video_title = performer + ' - ' + song_name
2900 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): this error message is missing the word "extract"
# ('unable to mtvn_uri'); fix the string in a code change.
2902 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2904 mtvn_uri = mobj.group(1)
2906 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2908 self._downloader.trouble(u'ERROR: unable to extract content id')
2910 content_id = mobj.group(1)
# mediaGen endpoint returns the rendition list for this uri/content id pair.
2912 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2913 self.report_extraction(video_id)
2914 request = urllib2.Request(videogen_url)
2916 metadataXml = urllib2.urlopen(request).read()
2917 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2918 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2921 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2922 renditions = mdoc.findall('.//rendition')
2924 # For now, always pick the highest quality.
2925 rendition = renditions[-1]
# Build a format label like 'mp4-640x480_1200' from the rendition attributes.
2928 _,_,ext = rendition.attrib['type'].partition('/')
2929 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2930 video_url = rendition.find('./src').text
2932 self._downloader.trouble('Invalid rendition field.')
2938 'uploader': performer,
2939 'title': video_title,
2940 'stitle': simplify_title(video_title),