youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import HTMLParser
   6 import httplib
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import time
  12 import urllib
  13 import urllib2
  14 import email.utils
  15
  16 try:
  17         import cStringIO as StringIO
  18 except ImportError:
  19         import StringIO
  20
  21 # parse_qs was moved from the cgi module to the urlparse module recently.
  22 try:
  23         from urlparse import parse_qs
  24 except ImportError:
  25         from cgi import parse_qs
  26
  27 try:
  28         import lxml.etree
  29 except ImportError:
  30         pass # Handled below
  31
  32 try:
  33         import xml.etree.ElementTree
  34 except ImportError: # Python<2.5: Not officially supported, but let it slip
  35         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
  36
  37 from Utils import *
  38
  39
  40 class InfoExtractor(object):
  41         """Information Extractor class.
  42
  43         Information extractors are the classes that, given a URL, extract
  44         information from the video (or videos) the URL refers to. This
  45         information includes the real video URL, the video title and simplified
  46         title, author and others. The information is stored in a dictionary
  47         which is then passed to the FileDownloader. The FileDownloader
  48         processes this information possibly downloading the video to the file
  49         system, among other possible outcomes. The dictionaries must include
  50         the following fields:
  51
  52         id:             Video identifier.
  53         url:            Final video URL.
  54         uploader:       Nickname of the video uploader.
  55         title:          Literal title.
  56         stitle:         Simplified title.
  57         ext:            Video filename extension.
  58         format:         Video format.
  59         player_url:     SWF Player URL (may be None).
  60
  61         The following fields are optional. Their primary purpose is to allow
  62         youtube-dl to serve as the backend for a video search function, such
  63         as the one in youtube2mp3.  They are only used when their respective
  64         forced printing functions are called:
  65
  66         thumbnail:      Full URL to a video thumbnail image.
  67         description:    One-line video description.
  68
  69         Subclasses of this one should re-define the _real_initialize() and
  70         _real_extract() methods and define a _VALID_URL regexp.
  71         Probably, they should also be added to the list of extractors.
  72         """
  73
  74         _ready = False
  75         _downloader = None
  76
  77         def __init__(self, downloader=None):
  78                 """Constructor. Receives an optional downloader."""
  79                 self._ready = False
  80                 self.set_downloader(downloader)
  81
  82         def suitable(self, url):
  83                 """Receives a URL and returns True if suitable for this IE."""
  84                 return re.match(self._VALID_URL, url) is not None
  85
  86         def initialize(self):
  87                 """Initializes an instance (authentication, etc)."""
  88                 if not self._ready:
  89                         self._real_initialize()
  90                         self._ready = True
  91
  92         def extract(self, url):
  93                 """Extracts URL information and returns it in list of dicts."""
  94                 self.initialize()
  95                 return self._real_extract(url)
  96
  97         def set_downloader(self, downloader):
  98                 """Sets the downloader for this IE."""
  99                 self._downloader = downloader
 100
 101         def _real_initialize(self):
 102                 """Real initialization process. Redefine in subclasses."""
 103                 pass
 104
 105         def _real_extract(self, url):
 106                 """Real extraction process. Redefine in subclasses."""
 107                 pass
 108
 109
 110 class YoutubeIE(InfoExtractor):
 111         """Information extractor for youtube.com."""
 112
 113         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
 114         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 115         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 116         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 117         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 118         _NETRC_MACHINE = 'youtube'
 119         # Listed in order of quality
 120         _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 121         _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 122         _video_extensions = {
 123                 '13': '3gp',
 124                 '17': 'mp4',
 125                 '18': 'mp4',
 126                 '22': 'mp4',
 127                 '37': 'mp4',
 128                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 129                 '43': 'webm',
 130                 '44': 'webm',
 131                 '45': 'webm',
 132         }
 133         _video_dimensions = {
 134                 '5': '240x400',
 135                 '6': '???',
 136                 '13': '???',
 137                 '17': '144x176',
 138                 '18': '360x640',
 139                 '22': '720x1280',
 140                 '34': '360x640',
 141                 '35': '480x854',
 142                 '37': '1080x1920',
 143                 '38': '3072x4096',
 144                 '43': '360x640',
 145                 '44': '480x854',
 146                 '45': '720x1280',
 147         }
 148         IE_NAME = u'youtube'
 149
 150         def report_lang(self):
 151                 """Report attempt to set language."""
 152                 self._downloader.to_screen(u'[youtube] Setting language')
 153
 154         def report_login(self):
 155                 """Report attempt to log in."""
 156                 self._downloader.to_screen(u'[youtube] Logging in')
 157
 158         def report_age_confirmation(self):
 159                 """Report attempt to confirm age."""
 160                 self._downloader.to_screen(u'[youtube] Confirming age')
 161
 162         def report_video_webpage_download(self, video_id):
 163                 """Report attempt to download video webpage."""
 164                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 165
 166         def report_video_info_webpage_download(self, video_id):
 167                 """Report attempt to download video info webpage."""
 168                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 169
 170         def report_video_subtitles_download(self, video_id):
 171                 """Report attempt to download video info webpage."""
 172                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 173
 174         def report_information_extraction(self, video_id):
 175                 """Report attempt to extract video information."""
 176                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 177
 178         def report_unavailable_format(self, video_id, format):
 179                 """Report extracted video URL."""
 180                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 181
 182         def report_rtmp_download(self):
 183                 """Indicate the download will use the RTMP protocol."""
 184                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 185
 186         def _closed_captions_xml_to_srt(self, xml_string):
 187                 srt = ''
 188                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 189                 # TODO parse xml instead of regex
 190                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
 191                         if not dur: dur = '4'
 192                         start = float(start)
 193                         end = start + float(dur)
 194                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 195                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 196                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
 197                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
 198                         srt += str(n) + '\n'
 199                         srt += start + ' --> ' + end + '\n'
 200                         srt += caption + '\n\n'
 201                 return srt
 202
 203         def _print_formats(self, formats):
 204                 print 'Available formats:'
 205                 for x in formats:
 206                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
 207
 208         def _real_initialize(self):
 209                 if self._downloader is None:
 210                         return
 211
 212                 username = None
 213                 password = None
 214                 downloader_params = self._downloader.params
 215
 216                 # Attempt to use provided username and password or .netrc data
 217                 if downloader_params.get('username', None) is not None:
 218                         username = downloader_params['username']
 219                         password = downloader_params['password']
 220                 elif downloader_params.get('usenetrc', False):
 221                         try:
 222                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 223                                 if info is not None:
 224                                         username = info[0]
 225                                         password = info[2]
 226                                 else:
 227                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 228                         except (IOError, netrc.NetrcParseError), err:
 229                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 230                                 return
 231
 232                 # Set language
 233                 request = urllib2.Request(self._LANG_URL)
 234                 try:
 235                         self.report_lang()
 236                         urllib2.urlopen(request).read()
 237                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 238                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 239                         return
 240
 241                 # No authentication to be performed
 242                 if username is None:
 243                         return
 244
 245                 # Log in
 246                 login_form = {
 247                                 'current_form': 'loginForm',
 248                                 'next':         '/',
 249                                 'action_login': 'Log In',
 250                                 'username':     username,
 251                                 'password':     password,
 252                                 }
 253                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
 254                 try:
 255                         self.report_login()
 256                         login_results = urllib2.urlopen(request).read()
 257                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 258                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 259                                 return
 260                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 261                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 262                         return
 263
 264                 # Confirm age
 265                 age_form = {
 266                                 'next_url':             '/',
 267                                 'action_confirm':       'Confirm',
 268                                 }
 269                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
 270                 try:
 271                         self.report_age_confirmation()
 272                         age_results = urllib2.urlopen(request).read()
 273                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 274                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 275                         return
 276
 277         def _real_extract(self, url):
 278                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 279                 mobj = re.search(self._NEXT_URL_RE, url)
 280                 if mobj:
 281                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
 282
 283                 # Extract video id from URL
 284                 mobj = re.match(self._VALID_URL, url)
 285                 if mobj is None:
 286                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 287                         return
 288                 video_id = mobj.group(2)
 289
 290                 # Get video webpage
 291                 self.report_video_webpage_download(video_id)
 292                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
 293                 try:
 294                         video_webpage = urllib2.urlopen(request).read()
 295                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 296                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 297                         return
 298
 299                 # Attempt to extract SWF player URL
 300                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 301                 if mobj is not None:
 302                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 303                 else:
 304                         player_url = None
 305
 306                 # Get video info
 307                 self.report_video_info_webpage_download(video_id)
 308                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 309                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 310                                         % (video_id, el_type))
 311                         request = urllib2.Request(video_info_url)
 312                         try:
 313                                 video_info_webpage = urllib2.urlopen(request).read()
 314                                 video_info = parse_qs(video_info_webpage)
 315                                 if 'token' in video_info:
 316                                         break
 317                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 318                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 319                                 return
 320                 if 'token' not in video_info:
 321                         if 'reason' in video_info:
 322                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
 323                         else:
 324                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 325                         return
 326
 327                 # Start extracting information
 328                 self.report_information_extraction(video_id)
 329
 330                 # uploader
 331                 if 'author' not in video_info:
 332                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 333                         return
 334                 video_uploader = urllib.unquote_plus(video_info['author'][0])
 335
 336                 # title
 337                 if 'title' not in video_info:
 338                         self._downloader.trouble(u'ERROR: unable to extract video title')
 339                         return
 340                 video_title = urllib.unquote_plus(video_info['title'][0])
 341                 video_title = video_title.decode('utf-8')
 342                 video_title = sanitize_title(video_title)
 343
 344                 # simplified title
 345                 simple_title = simplify_title(video_title)
 346
 347                 # thumbnail image
 348                 if 'thumbnail_url' not in video_info:
 349                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 350                         video_thumbnail = ''
 351                 else:   # don't panic if we can't find it
 352                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
 353
 354                 # upload date
 355                 upload_date = u'NA'
 356                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 357                 if mobj is not None:
 358                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 359                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 360                         for expression in format_expressions:
 361                                 try:
 362                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 363                                 except:
 364                                         pass
 365
 366                 # description
 367                 try:
 368                         lxml.etree
 369                 except NameError:
 370                         video_description = u'No description available.'
 371                         mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
 372                         if mobj is not None:
 373                                 video_description = mobj.group(1).decode('utf-8')
 374                 else:
 375                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
 376                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
 377                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
 378                         # TODO use another parser
 379
 380                 # closed captions
 381                 video_subtitles = None
 382                 if self._downloader.params.get('writesubtitles', False):
 383                         self.report_video_subtitles_download(video_id)
 384                         request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 385                         try:
 386                                 srt_list = urllib2.urlopen(request).read()
 387                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 388                                 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
 389                         else:
 390                                 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
 391                                 if srt_lang_list:
 392                                         if self._downloader.params.get('subtitleslang', False):
 393                                                 srt_lang = self._downloader.params.get('subtitleslang')
 394                                         elif 'en' in srt_lang_list:
 395                                                 srt_lang = 'en'
 396                                         else:
 397                                                 srt_lang = srt_lang_list[0]
 398                                         if not srt_lang in srt_lang_list:
 399                                                 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
 400                                         else:
 401                                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
 402                                                 try:
 403                                                         srt_xml = urllib2.urlopen(request).read()
 404                                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 405                                                         self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
 406                                                 else:
 407                                                         video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
 408                                 else:
 409                                         self._downloader.trouble(u'WARNING: video has no closed captions')
 410
 411                 # token
 412                 video_token = urllib.unquote_plus(video_info['token'][0])
 413
 414                 # Decide which formats to download
 415                 req_format = self._downloader.params.get('format', None)
 416
 417                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 418                         self.report_rtmp_download()
 419                         video_url_list = [(None, video_info['conn'][0])]
 420                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 421                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 422                         url_data = [parse_qs(uds) for uds in url_data_strs]
 423                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
 424                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
 425
 426                         format_limit = self._downloader.params.get('format_limit', None)
 427                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 428                         if format_limit is not None and format_limit in available_formats:
 429                                 format_list = available_formats[available_formats.index(format_limit):]
 430                         else:
 431                                 format_list = available_formats
 432                         existing_formats = [x for x in format_list if x in url_map]
 433                         if len(existing_formats) == 0:
 434                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 435                                 return
 436                         if self._downloader.params.get('listformats', None):
 437                                 self._print_formats(existing_formats)
 438                                 return
 439                         if req_format is None or req_format == 'best':
 440                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 441                         elif req_format == 'worst':
 442                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 443                         elif req_format in ('-1', 'all'):
 444                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 445                         else:
 446                                 # Specific formats. We pick the first in a slash-delimeted sequence.
 447                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 448                                 req_formats = req_format.split('/')
 449                                 video_url_list = None
 450                                 for rf in req_formats:
 451                                         if rf in url_map:
 452                                                 video_url_list = [(rf, url_map[rf])]
 453                                                 break
 454                                 if video_url_list is None:
 455                                         self._downloader.trouble(u'ERROR: requested format not available')
 456                                         return
 457                 else:
 458                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 459                         return
 460
 461                 for format_param, video_real_url in video_url_list:
 462                         # At this point we have a new video
 463                         self._downloader.increment_downloads()
 464
 465                         # Extension
 466                         video_extension = self._video_extensions.get(format_param, 'flv')
 467
 468                         try:
 469                                 # Process video information
 470                                 self._downloader.process_info({
 471                                         'id':           video_id.decode('utf-8'),
 472                                         'url':          video_real_url.decode('utf-8'),
 473                                         'uploader':     video_uploader.decode('utf-8'),
 474                                         'upload_date':  upload_date,
 475                                         'title':        video_title,
 476                                         'stitle':       simple_title,
 477                                         'ext':          video_extension.decode('utf-8'),
 478                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 479                                         'thumbnail':    video_thumbnail.decode('utf-8'),
 480                                         'description':  video_description,
 481                                         'player_url':   player_url,
 482                                         'subtitles':    video_subtitles
 483                                 })
 484                         except UnavailableVideoError, err:
 485                                 self._downloader.trouble(u'\nERROR: unable to download video')
 486
 487
 488 class MetacafeIE(InfoExtractor):
 489         """Information Extractor for metacafe.com."""
 490
 491         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 492         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 493         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 494         _youtube_ie = None
 495         IE_NAME = u'metacafe'
 496
 497         def __init__(self, youtube_ie, downloader=None):
 498                 InfoExtractor.__init__(self, downloader)
 499                 self._youtube_ie = youtube_ie
 500
 501         def report_disclaimer(self):
 502                 """Report disclaimer retrieval."""
 503                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 504
 505         def report_age_confirmation(self):
 506                 """Report attempt to confirm age."""
 507                 self._downloader.to_screen(u'[metacafe] Confirming age')
 508
 509         def report_download_webpage(self, video_id):
 510                 """Report webpage download."""
 511                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 512
 513         def report_extraction(self, video_id):
 514                 """Report information extraction."""
 515                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 516
 517         def _real_initialize(self):
 518                 # Retrieve disclaimer
 519                 request = urllib2.Request(self._DISCLAIMER)
 520                 try:
 521                         self.report_disclaimer()
 522                         disclaimer = urllib2.urlopen(request).read()
 523                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 524                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 525                         return
 526
 527                 # Confirm age
 528                 disclaimer_form = {
 529                         'filters': '0',
 530                         'submit': "Continue - I'm over 18",
 531                         }
 532                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
 533                 try:
 534                         self.report_age_confirmation()
 535                         disclaimer = urllib2.urlopen(request).read()
 536                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 537                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 538                         return
 539
 540         def _real_extract(self, url):
 541                 # Extract id and simplified title from URL
 542                 mobj = re.match(self._VALID_URL, url)
 543                 if mobj is None:
 544                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 545                         return
 546
 547                 video_id = mobj.group(1)
 548
 549                 # Check if video comes from YouTube
 550                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 551                 if mobj2 is not None:
 552                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 553                         return
 554
 555                 # At this point we have a new video
 556                 self._downloader.increment_downloads()
 557
 558                 simple_title = mobj.group(2).decode('utf-8')
 559
 560                 # Retrieve video webpage to extract further information
 561                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 562                 try:
 563                         self.report_download_webpage(video_id)
 564                         webpage = urllib2.urlopen(request).read()
 565                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 566                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 567                         return
 568
 569                 # Extract URL, uploader and title from webpage
 570                 self.report_extraction(video_id)
 571                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 572                 if mobj is not None:
 573                         mediaURL = urllib.unquote(mobj.group(1))
 574                         video_extension = mediaURL[-3:]
 575
 576                         # Extract gdaKey if available
 577                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 578                         if mobj is None:
 579                                 video_url = mediaURL
 580                         else:
 581                                 gdaKey = mobj.group(1)
 582                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 583                 else:
 584                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 585                         if mobj is None:
 586                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 587                                 return
 588                         vardict = parse_qs(mobj.group(1))
 589                         if 'mediaData' not in vardict:
 590                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 591                                 return
 592                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 593                         if mobj is None:
 594                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 595                                 return
 596                         mediaURL = mobj.group(1).replace('\\/', '/')
 597                         video_extension = mediaURL[-3:]
 598                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 599
 600                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 601                 if mobj is None:
 602                         self._downloader.trouble(u'ERROR: unable to extract title')
 603                         return
 604                 video_title = mobj.group(1).decode('utf-8')
 605                 video_title = sanitize_title(video_title)
 606
 607                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
 608                 if mobj is None:
 609                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 610                         return
 611                 video_uploader = mobj.group(1)
 612
 613                 try:
 614                         # Process video information
 615                         self._downloader.process_info({
 616                                 'id':           video_id.decode('utf-8'),
 617                                 'url':          video_url.decode('utf-8'),
 618                                 'uploader':     video_uploader.decode('utf-8'),
 619                                 'upload_date':  u'NA',
 620                                 'title':        video_title,
 621                                 'stitle':       simple_title,
 622                                 'ext':          video_extension.decode('utf-8'),
 623                                 'format':       u'NA',
 624                                 'player_url':   None,
 625                         })
 626                 except UnavailableVideoError:
 627                         self._downloader.trouble(u'\nERROR: unable to download video')
 628
 629
 630 class DailymotionIE(InfoExtractor):
 631         """Information Extractor for Dailymotion"""
 632
 633         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
 634         IE_NAME = u'dailymotion'
 635
 636         def __init__(self, downloader=None):
 637                 InfoExtractor.__init__(self, downloader)
 638
 639         def report_download_webpage(self, video_id):
 640                 """Report webpage download."""
 641                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 642
 643         def report_extraction(self, video_id):
 644                 """Report information extraction."""
 645                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 646
 647         def _real_extract(self, url):
 648                 # Extract id and simplified title from URL
 649                 mobj = re.match(self._VALID_URL, url)
 650                 if mobj is None:
 651                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 652                         return
 653
 654                 # At this point we have a new video
 655                 self._downloader.increment_downloads()
 656                 video_id = mobj.group(1)
 657
 658                 video_extension = 'flv'
 659
 660                 # Retrieve video webpage to extract further information
 661                 request = urllib2.Request(url)
 662                 request.add_header('Cookie', 'family_filter=off')
 663                 try:
 664                         self.report_download_webpage(video_id)
 665                         webpage = urllib2.urlopen(request).read()
 666                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 667                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 668                         return
 669
 670                 # Extract URL, uploader and title from webpage
 671                 self.report_extraction(video_id)
 672                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
 673                 if mobj is None:
 674                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 675                         return
 676                 sequence = urllib.unquote(mobj.group(1))
 677                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
 678                 if mobj is None:
 679                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 680                         return
 681                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
 682
 683                 # if needed add http://www.dailymotion.com/ if relative URL
 684
 685                 video_url = mediaURL
 686
 687                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 688                 if mobj is None:
 689                         self._downloader.trouble(u'ERROR: unable to extract title')
 690                         return
 691                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
 692                 video_title = sanitize_title(video_title)
 693                 simple_title = simplify_title(video_title)
 694
 695                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
 696                 if mobj is None:
 697                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 698                         return
 699                 video_uploader = mobj.group(1)
 700
 701                 try:
 702                         # Process video information
 703                         self._downloader.process_info({
 704                                 'id':           video_id.decode('utf-8'),
 705                                 'url':          video_url.decode('utf-8'),
 706                                 'uploader':     video_uploader.decode('utf-8'),
 707                                 'upload_date':  u'NA',
 708                                 'title':        video_title,
 709                                 'stitle':       simple_title,
 710                                 'ext':          video_extension.decode('utf-8'),
 711                                 'format':       u'NA',
 712                                 'player_url':   None,
 713                         })
 714                 except UnavailableVideoError:
 715                         self._downloader.trouble(u'\nERROR: unable to download video')
 716
 717
 718 class GoogleIE(InfoExtractor):
 719         """Information extractor for video.google.com."""
 720
 721         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 722         IE_NAME = u'video.google'
 723
 724         def __init__(self, downloader=None):
 725                 InfoExtractor.__init__(self, downloader)
 726
 727         def report_download_webpage(self, video_id):
 728                 """Report webpage download."""
 729                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
 730
 731         def report_extraction(self, video_id):
 732                 """Report information extraction."""
 733                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 734
 735         def _real_extract(self, url):
 736                 # Extract id from URL
 737                 mobj = re.match(self._VALID_URL, url)
 738                 if mobj is None:
 739                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 740                         return
 741
 742                 # At this point we have a new video
 743                 self._downloader.increment_downloads()
 744                 video_id = mobj.group(1)
 745
 746                 video_extension = 'mp4'
 747
 748                 # Retrieve video webpage to extract further information
 749                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 750                 try:
 751                         self.report_download_webpage(video_id)
 752                         webpage = urllib2.urlopen(request).read()
 753                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 754                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 755                         return
 756
 757                 # Extract URL, uploader, and title from webpage
 758                 self.report_extraction(video_id)
 759                 mobj = re.search(r"download_url:'([^']+)'", webpage)
 760                 if mobj is None:
 761                         video_extension = 'flv'
 762                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
 763                 if mobj is None:
 764                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 765                         return
 766                 mediaURL = urllib.unquote(mobj.group(1))
 767                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
 768                 mediaURL = mediaURL.replace('\\x26', '\x26')
 769
 770                 video_url = mediaURL
 771
 772                 mobj = re.search(r'<title>(.*)</title>', webpage)
 773                 if mobj is None:
 774                         self._downloader.trouble(u'ERROR: unable to extract title')
 775                         return
 776                 video_title = mobj.group(1).decode('utf-8')
 777                 video_title = sanitize_title(video_title)
 778                 simple_title = simplify_title(video_title)
 779
 780                 # Extract video description
 781                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
 782                 if mobj is None:
 783                         self._downloader.trouble(u'ERROR: unable to extract video description')
 784                         return
 785                 video_description = mobj.group(1).decode('utf-8')
 786                 if not video_description:
 787                         video_description = 'No description available.'
 788
 789                 # Extract video thumbnail
 790                 if self._downloader.params.get('forcethumbnail', False):
 791                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
 792                         try:
 793                                 webpage = urllib2.urlopen(request).read()
 794                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 795                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 796                                 return
 797                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
 798                         if mobj is None:
 799                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 800                                 return
 801                         video_thumbnail = mobj.group(1)
 802                 else:   # we need something to pass to process_info
 803                         video_thumbnail = ''
 804
 805                 try:
 806                         # Process video information
 807                         self._downloader.process_info({
 808                                 'id':           video_id.decode('utf-8'),
 809                                 'url':          video_url.decode('utf-8'),
 810                                 'uploader':     u'NA',
 811                                 'upload_date':  u'NA',
 812                                 'title':        video_title,
 813                                 'stitle':       simple_title,
 814                                 'ext':          video_extension.decode('utf-8'),
 815                                 'format':       u'NA',
 816                                 'player_url':   None,
 817                         })
 818                 except UnavailableVideoError:
 819                         self._downloader.trouble(u'\nERROR: unable to download video')
 820
 821
 822 class PhotobucketIE(InfoExtractor):
 823         """Information extractor for photobucket.com."""
 824
 825         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 826         IE_NAME = u'photobucket'
 827
 828         def __init__(self, downloader=None):
 829                 InfoExtractor.__init__(self, downloader)
 830
 831         def report_download_webpage(self, video_id):
 832                 """Report webpage download."""
 833                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 834
 835         def report_extraction(self, video_id):
 836                 """Report information extraction."""
 837                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 838
 839         def _real_extract(self, url):
 840                 # Extract id from URL
 841                 mobj = re.match(self._VALID_URL, url)
 842                 if mobj is None:
 843                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 844                         return
 845
 846                 # At this point we have a new video
 847                 self._downloader.increment_downloads()
 848                 video_id = mobj.group(1)
 849
 850                 video_extension = 'flv'
 851
 852                 # Retrieve video webpage to extract further information
 853                 request = urllib2.Request(url)
 854                 try:
 855                         self.report_download_webpage(video_id)
 856                         webpage = urllib2.urlopen(request).read()
 857                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 858                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 859                         return
 860
 861                 # Extract URL, uploader, and title from webpage
 862                 self.report_extraction(video_id)
 863                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 864                 if mobj is None:
 865                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 866                         return
 867                 mediaURL = urllib.unquote(mobj.group(1))
 868
 869                 video_url = mediaURL
 870
 871                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 872                 if mobj is None:
 873                         self._downloader.trouble(u'ERROR: unable to extract title')
 874                         return
 875                 video_title = mobj.group(1).decode('utf-8')
 876                 video_title = sanitize_title(video_title)
 877                 simple_title = simplify_title(video_title)
 878
 879                 video_uploader = mobj.group(2).decode('utf-8')
 880
 881                 try:
 882                         # Process video information
 883                         self._downloader.process_info({
 884                                 'id':           video_id.decode('utf-8'),
 885                                 'url':          video_url.decode('utf-8'),
 886                                 'uploader':     video_uploader,
 887                                 'upload_date':  u'NA',
 888                                 'title':        video_title,
 889                                 'stitle':       simple_title,
 890                                 'ext':          video_extension.decode('utf-8'),
 891                                 'format':       u'NA',
 892                                 'player_url':   None,
 893                         })
 894                 except UnavailableVideoError:
 895                         self._downloader.trouble(u'\nERROR: unable to download video')
 896
 897
 898 class YahooIE(InfoExtractor):
 899         """Information extractor for video.yahoo.com."""
 900
 901         # _VALID_URL matches all Yahoo! Video URLs
 902         # _VPAGE_URL matches only the extractable '/watch/' URLs
 903         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 904         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 905         IE_NAME = u'video.yahoo'
 906
 907         def __init__(self, downloader=None):
 908                 InfoExtractor.__init__(self, downloader)
 909
 910         def report_download_webpage(self, video_id):
 911                 """Report webpage download."""
 912                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 913
 914         def report_extraction(self, video_id):
 915                 """Report information extraction."""
 916                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 917
 918         def _real_extract(self, url, new_video=True):
 919                 # Extract ID from URL
 920                 mobj = re.match(self._VALID_URL, url)
 921                 if mobj is None:
 922                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 923                         return
 924
 925                 # At this point we have a new video
 926                 self._downloader.increment_downloads()
 927                 video_id = mobj.group(2)
 928                 video_extension = 'flv'
 929
 930                 # Rewrite valid but non-extractable URLs as
 931                 # extractable English language /watch/ URLs
 932                 if re.match(self._VPAGE_URL, url) is None:
 933                         request = urllib2.Request(url)
 934                         try:
 935                                 webpage = urllib2.urlopen(request).read()
 936                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 937                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 938                                 return
 939
 940                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 941                         if mobj is None:
 942                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 943                                 return
 944                         yahoo_id = mobj.group(1)
 945
 946                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 947                         if mobj is None:
 948                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 949                                 return
 950                         yahoo_vid = mobj.group(1)
 951
 952                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 953                         return self._real_extract(url, new_video=False)
 954
 955                 # Retrieve video webpage to extract further information
 956                 request = urllib2.Request(url)
 957                 try:
 958                         self.report_download_webpage(video_id)
 959                         webpage = urllib2.urlopen(request).read()
 960                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 961                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 962                         return
 963
 964                 # Extract uploader and title from webpage
 965                 self.report_extraction(video_id)
 966                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 967                 if mobj is None:
 968                         self._downloader.trouble(u'ERROR: unable to extract video title')
 969                         return
 970                 video_title = mobj.group(1).decode('utf-8')
 971                 simple_title = simplify_title(video_title)
 972
 973                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 974                 if mobj is None:
 975                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
 976                         return
 977                 video_uploader = mobj.group(1).decode('utf-8')
 978
 979                 # Extract video thumbnail
 980                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 981                 if mobj is None:
 982                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 983                         return
 984                 video_thumbnail = mobj.group(1).decode('utf-8')
 985
 986                 # Extract video description
 987                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 988                 if mobj is None:
 989                         self._downloader.trouble(u'ERROR: unable to extract video description')
 990                         return
 991                 video_description = mobj.group(1).decode('utf-8')
 992                 if not video_description:
 993                         video_description = 'No description available.'
 994
 995                 # Extract video height and width
 996                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 997                 if mobj is None:
 998                         self._downloader.trouble(u'ERROR: unable to extract video height')
 999                         return
1000                 yv_video_height = mobj.group(1)
1001
1002                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1003                 if mobj is None:
1004                         self._downloader.trouble(u'ERROR: unable to extract video width')
1005                         return
1006                 yv_video_width = mobj.group(1)
1007
1008                 # Retrieve video playlist to extract media URL
1009                 # I'm not completely sure what all these options are, but we
1010                 # seem to need most of them, otherwise the server sends a 401.
1011                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1012                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1013                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1014                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1015                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1016                 try:
1017                         self.report_download_webpage(video_id)
1018                         webpage = urllib2.urlopen(request).read()
1019                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1020                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1021                         return
1022
1023                 # Extract media URL from playlist XML
1024                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1025                 if mobj is None:
1026                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1027                         return
1028                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1029                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1030
1031                 try:
1032                         # Process video information
1033                         self._downloader.process_info({
1034                                 'id':           video_id.decode('utf-8'),
1035                                 'url':          video_url,
1036                                 'uploader':     video_uploader,
1037                                 'upload_date':  u'NA',
1038                                 'title':        video_title,
1039                                 'stitle':       simple_title,
1040                                 'ext':          video_extension.decode('utf-8'),
1041                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1042                                 'description':  video_description,
1043                                 'thumbnail':    video_thumbnail,
1044                                 'player_url':   None,
1045                         })
1046                 except UnavailableVideoError:
1047                         self._downloader.trouble(u'\nERROR: unable to download video')
1048
1049
1050 class VimeoIE(InfoExtractor):
1051         """Information extractor for vimeo.com."""
1052
1053         # _VALID_URL matches Vimeo URLs
1054         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1055         IE_NAME = u'vimeo'
1056
1057         def __init__(self, downloader=None):
1058                 InfoExtractor.__init__(self, downloader)
1059
1060         def report_download_webpage(self, video_id):
1061                 """Report webpage download."""
1062                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1063
1064         def report_extraction(self, video_id):
1065                 """Report information extraction."""
1066                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1067
1068         def _real_extract(self, url, new_video=True):
1069                 # Extract ID from URL
1070                 mobj = re.match(self._VALID_URL, url)
1071                 if mobj is None:
1072                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1073                         return
1074
1075                 # At this point we have a new video
1076                 self._downloader.increment_downloads()
1077                 video_id = mobj.group(1)
1078
1079                 # Retrieve video webpage to extract further information
1080                 request = urllib2.Request(url, None, std_headers)
1081                 try:
1082                         self.report_download_webpage(video_id)
1083                         webpage = urllib2.urlopen(request).read()
1084                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1085                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1086                         return
1087
1088                 # Now we begin extracting as much information as we can from what we
1089                 # retrieved. First we extract the information common to all extractors,
1090                 # and latter we extract those that are Vimeo specific.
1091                 self.report_extraction(video_id)
1092
1093                 # Extract the config JSON
1094                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1095                 try:
1096                         config = json.loads(config)
1097                 except:
1098                         self._downloader.trouble(u'ERROR: unable to extract info section')
1099                         return
1100
1101                 # Extract title
1102                 video_title = config["video"]["title"]
1103                 simple_title = simplify_title(video_title)
1104
1105                 # Extract uploader
1106                 video_uploader = config["video"]["owner"]["name"]
1107
1108                 # Extract video thumbnail
1109                 video_thumbnail = config["video"]["thumbnail"]
1110
1111                 # Extract video description
1112                 try:
1113                         lxml.etree
1114                 except NameError:
1115                         video_description = u'No description available.'
1116                         mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
1117                         if mobj is not None:
1118                                 video_description = mobj.group(1)
1119                 else:
1120                         html_parser = lxml.etree.HTMLParser()
1121                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
1122                         video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
1123                         # TODO use another parser
1124
1125                 # Extract upload date
1126                 video_upload_date = u'NA'
1127                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1128                 if mobj is not None:
1129                         video_upload_date = mobj.group(1)
1130
1131                 # Vimeo specific: extract request signature and timestamp
1132                 sig = config['request']['signature']
1133                 timestamp = config['request']['timestamp']
1134
1135                 # Vimeo specific: extract video codec and quality information
1136                 # TODO bind to format param
1137                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1138                 for codec in codecs:
1139                         if codec[0] in config["video"]["files"]:
1140                                 video_codec = codec[0]
1141                                 video_extension = codec[1]
1142                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1143                                 else: quality = 'sd'
1144                                 break
1145                 else:
1146                         self._downloader.trouble(u'ERROR: no known codec found')
1147                         return
1148
1149                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1150                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1151
1152                 try:
1153                         # Process video information
1154                         self._downloader.process_info({
1155                                 'id':           video_id,
1156                                 'url':          video_url,
1157                                 'uploader':     video_uploader,
1158                                 'upload_date':  video_upload_date,
1159                                 'title':        video_title,
1160                                 'stitle':       simple_title,
1161                                 'ext':          video_extension,
1162                                 'thumbnail':    video_thumbnail,
1163                                 'description':  video_description,
1164                                 'player_url':   None,
1165                         })
1166                 except UnavailableVideoError:
1167                         self._downloader.trouble(u'ERROR: unable to download video')
1168
1169
1170 class GenericIE(InfoExtractor):
1171         """Generic last-resort information extractor."""
1172
1173         _VALID_URL = r'.*'
1174         IE_NAME = u'generic'
1175
1176         def __init__(self, downloader=None):
1177                 InfoExtractor.__init__(self, downloader)
1178
1179         def report_download_webpage(self, video_id):
1180                 """Report webpage download."""
1181                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1182                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1183
1184         def report_extraction(self, video_id):
1185                 """Report information extraction."""
1186                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1187
1188         def report_following_redirect(self, new_url):
1189                 """Report information extraction."""
1190                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1191
1192         def _test_redirect(self, url):
1193                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1194                 class HeadRequest(urllib2.Request):
1195                         def get_method(self):
1196                                 return "HEAD"
1197
1198                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1199                         """
1200                         Subclass the HTTPRedirectHandler to make it use our
1201                         HeadRequest also on the redirected URL
1202                         """
1203                         def redirect_request(self, req, fp, code, msg, headers, newurl):
1204                                 if code in (301, 302, 303, 307):
1205                                     newurl = newurl.replace(' ', '%20')
1206                                     newheaders = dict((k,v) for k,v in req.headers.items()
1207                                                       if k.lower() not in ("content-length", "content-type"))
1208                                     return HeadRequest(newurl,
1209                                                        headers=newheaders,
1210                                                        origin_req_host=req.get_origin_req_host(),
1211                                                        unverifiable=True)
1212                                 else:
1213                                     raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1214
1215                 class HTTPMethodFallback(urllib2.BaseHandler):
1216                         """
1217                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1218                         """
1219                         def http_error_405(self, req, fp, code, msg, headers):
1220                                 fp.read()
1221                                 fp.close()
1222
1223                                 newheaders = dict((k,v) for k,v in req.headers.items()
1224                                                   if k.lower() not in ("content-length", "content-type"))
1225                                 return self.parent.open(urllib2.Request(req.get_full_url(),
1226                                                                  headers=newheaders,
1227                                                                  origin_req_host=req.get_origin_req_host(),
1228                                                                  unverifiable=True))
1229
1230                 # Build our opener
1231                 opener = urllib2.OpenerDirector()
1232                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1233                                         HTTPMethodFallback, HEADRedirectHandler,
1234                                         urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1235                         opener.add_handler(handler())
1236
1237                 response = opener.open(HeadRequest(url))
1238                 new_url = response.geturl()
1239
1240                 if url == new_url: return False
1241
1242                 self.report_following_redirect(new_url)
1243                 self._downloader.download([new_url])
1244                 return True
1245
1246         def _real_extract(self, url):
1247                 if self._test_redirect(url): return
1248
1249                 # At this point we have a new video
1250                 self._downloader.increment_downloads()
1251
1252                 video_id = url.split('/')[-1]
1253                 request = urllib2.Request(url)
1254                 try:
1255                         self.report_download_webpage(video_id)
1256                         webpage = urllib2.urlopen(request).read()
1257                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1258                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1259                         return
1260                 except ValueError, err:
1261                         # since this is the last-resort InfoExtractor, if
1262                         # this error is thrown, it'll be thrown here
1263                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1264                         return
1265
1266                 self.report_extraction(video_id)
1267                 # Start with something easy: JW Player in SWFObject
1268                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1269                 if mobj is None:
1270                         # Broaden the search a little bit
1271                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1272                 if mobj is None:
1273                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1274                         return
1275
1276                 # It's possible that one of the regexes
1277                 # matched, but returned an empty group:
1278                 if mobj.group(1) is None:
1279                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1280                         return
1281
1282                 video_url = urllib.unquote(mobj.group(1))
1283                 video_id = os.path.basename(video_url)
1284
1285                 # here's a fun little line of code for you:
1286                 video_extension = os.path.splitext(video_id)[1][1:]
1287                 video_id = os.path.splitext(video_id)[0]
1288
1289                 # it's tempting to parse this further, but you would
1290                 # have to take into account all the variations like
1291                 #   Video Title - Site Name
1292                 #   Site Name | Video Title
1293                 #   Video Title - Tagline | Site Name
1294                 # and so on and so forth; it's just not practical
1295                 mobj = re.search(r'<title>(.*)</title>', webpage)
1296                 if mobj is None:
1297                         self._downloader.trouble(u'ERROR: unable to extract title')
1298                         return
1299                 video_title = mobj.group(1).decode('utf-8')
1300                 video_title = sanitize_title(video_title)
1301                 simple_title = simplify_title(video_title)
1302
1303                 # video uploader is domain name
1304                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1305                 if mobj is None:
1306                         self._downloader.trouble(u'ERROR: unable to extract title')
1307                         return
1308                 video_uploader = mobj.group(1).decode('utf-8')
1309
1310                 try:
1311                         # Process video information
1312                         self._downloader.process_info({
1313                                 'id':           video_id.decode('utf-8'),
1314                                 'url':          video_url.decode('utf-8'),
1315                                 'uploader':     video_uploader,
1316                                 'upload_date':  u'NA',
1317                                 'title':        video_title,
1318                                 'stitle':       simple_title,
1319                                 'ext':          video_extension.decode('utf-8'),
1320                                 'format':       u'NA',
1321                                 'player_url':   None,
1322                         })
1323                 except UnavailableVideoError, err:
1324                         self._downloader.trouble(u'\nERROR: unable to download video')
1325
1326
1327 class YoutubeSearchIE(InfoExtractor):
1328         """Information Extractor for YouTube search queries."""
1329         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1330         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1331         _youtube_ie = None
1332         _max_youtube_results = 1000
1333         IE_NAME = u'youtube:search'
1334
1335         def __init__(self, youtube_ie, downloader=None):
1336                 InfoExtractor.__init__(self, downloader)
1337                 self._youtube_ie = youtube_ie
1338
1339         def report_download_page(self, query, pagenum):
1340                 """Report attempt to download playlist page with given number."""
1341                 query = query.decode(preferredencoding())
1342                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1343
1344         def _real_initialize(self):
1345                 self._youtube_ie.initialize()
1346
1347         def _real_extract(self, query):
1348                 mobj = re.match(self._VALID_URL, query)
1349                 if mobj is None:
1350                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1351                         return
1352
1353                 prefix, query = query.split(':')
1354                 prefix = prefix[8:]
1355                 query = query.encode('utf-8')
1356                 if prefix == '':
1357                         self._download_n_results(query, 1)
1358                         return
1359                 elif prefix == 'all':
1360                         self._download_n_results(query, self._max_youtube_results)
1361                         return
1362                 else:
1363                         try:
1364                                 n = long(prefix)
1365                                 if n <= 0:
1366                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1367                                         return
1368                                 elif n > self._max_youtube_results:
1369                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1370                                         n = self._max_youtube_results
1371                                 self._download_n_results(query, n)
1372                                 return
1373                         except ValueError: # parsing prefix as integer fails
1374                                 self._download_n_results(query, 1)
1375                                 return
1376
1377         def _download_n_results(self, query, n):
1378                 """Downloads a specified number of results for a query"""
1379
1380                 video_ids = []
1381                 pagenum = 0
1382                 limit = n
1383
1384                 while (50 * pagenum) < limit:
1385                         self.report_download_page(query, pagenum+1)
1386                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1387                         request = urllib2.Request(result_url)
1388                         try:
1389                                 data = urllib2.urlopen(request).read()
1390                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1391                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1392                                 return
1393                         api_response = json.loads(data)['data']
1394
1395                         new_ids = list(video['id'] for video in api_response['items'])
1396                         video_ids += new_ids
1397
1398                         limit = min(n, api_response['totalItems'])
1399                         pagenum += 1
1400
1401                 if len(video_ids) > n:
1402                         video_ids = video_ids[:n]
1403                 for id in video_ids:
1404                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1405                 return
1406
1407
1408 class GoogleSearchIE(InfoExtractor):
1409         """Information Extractor for Google Video search queries."""
1410         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1411         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1412         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1413         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1414         _google_ie = None
1415         _max_google_results = 1000
1416         IE_NAME = u'video.google:search'
1417
1418         def __init__(self, google_ie, downloader=None):
1419                 InfoExtractor.__init__(self, downloader)
1420                 self._google_ie = google_ie
1421
1422         def report_download_page(self, query, pagenum):
1423                 """Report attempt to download playlist page with given number."""
1424                 query = query.decode(preferredencoding())
1425                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1426
1427         def _real_initialize(self):
1428                 self._google_ie.initialize()
1429
1430         def _real_extract(self, query):
1431                 mobj = re.match(self._VALID_URL, query)
1432                 if mobj is None:
1433                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1434                         return
1435
1436                 prefix, query = query.split(':')
1437                 prefix = prefix[8:]
1438                 query = query.encode('utf-8')
1439                 if prefix == '':
1440                         self._download_n_results(query, 1)
1441                         return
1442                 elif prefix == 'all':
1443                         self._download_n_results(query, self._max_google_results)
1444                         return
1445                 else:
1446                         try:
1447                                 n = long(prefix)
1448                                 if n <= 0:
1449                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1450                                         return
1451                                 elif n > self._max_google_results:
1452                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1453                                         n = self._max_google_results
1454                                 self._download_n_results(query, n)
1455                                 return
1456                         except ValueError: # parsing prefix as integer fails
1457                                 self._download_n_results(query, 1)
1458                                 return
1459
1460         def _download_n_results(self, query, n):
1461                 """Downloads a specified number of results for a query"""
1462
1463                 video_ids = []
1464                 pagenum = 0
1465
1466                 while True:
1467                         self.report_download_page(query, pagenum)
1468                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1469                         request = urllib2.Request(result_url)
1470                         try:
1471                                 page = urllib2.urlopen(request).read()
1472                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1473                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1474                                 return
1475
1476                         # Extract video identifiers
1477                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1478                                 video_id = mobj.group(1)
1479                                 if video_id not in video_ids:
1480                                         video_ids.append(video_id)
1481                                         if len(video_ids) == n:
1482                                                 # Specified n videos reached
1483                                                 for id in video_ids:
1484                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1485                                                 return
1486
1487                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1488                                 for id in video_ids:
1489                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1490                                 return
1491
1492                         pagenum = pagenum + 1
1493
1494
1495 class YahooSearchIE(InfoExtractor):
1496         """Information Extractor for Yahoo! Video search queries."""
1497         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1498         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1499         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1500         _MORE_PAGES_INDICATOR = r'\s*Next'
1501         _yahoo_ie = None
1502         _max_yahoo_results = 1000
1503         IE_NAME = u'video.yahoo:search'
1504
1505         def __init__(self, yahoo_ie, downloader=None):
1506                 InfoExtractor.__init__(self, downloader)
1507                 self._yahoo_ie = yahoo_ie
1508
1509         def report_download_page(self, query, pagenum):
1510                 """Report attempt to download playlist page with given number."""
1511                 query = query.decode(preferredencoding())
1512                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1513
1514         def _real_initialize(self):
1515                 self._yahoo_ie.initialize()
1516
1517         def _real_extract(self, query):
1518                 mobj = re.match(self._VALID_URL, query)
1519                 if mobj is None:
1520                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1521                         return
1522
1523                 prefix, query = query.split(':')
1524                 prefix = prefix[8:]
1525                 query = query.encode('utf-8')
1526                 if prefix == '':
1527                         self._download_n_results(query, 1)
1528                         return
1529                 elif prefix == 'all':
1530                         self._download_n_results(query, self._max_yahoo_results)
1531                         return
1532                 else:
1533                         try:
1534                                 n = long(prefix)
1535                                 if n <= 0:
1536                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1537                                         return
1538                                 elif n > self._max_yahoo_results:
1539                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1540                                         n = self._max_yahoo_results
1541                                 self._download_n_results(query, n)
1542                                 return
1543                         except ValueError: # parsing prefix as integer fails
1544                                 self._download_n_results(query, 1)
1545                                 return
1546
1547         def _download_n_results(self, query, n):
1548                 """Downloads a specified number of results for a query"""
1549
1550                 video_ids = []
1551                 already_seen = set()
1552                 pagenum = 1
1553
1554                 while True:
1555                         self.report_download_page(query, pagenum)
1556                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1557                         request = urllib2.Request(result_url)
1558                         try:
1559                                 page = urllib2.urlopen(request).read()
1560                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1561                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1562                                 return
1563
1564                         # Extract video identifiers
1565                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1566                                 video_id = mobj.group(1)
1567                                 if video_id not in already_seen:
1568                                         video_ids.append(video_id)
1569                                         already_seen.add(video_id)
1570                                         if len(video_ids) == n:
1571                                                 # Specified n videos reached
1572                                                 for id in video_ids:
1573                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1574                                                 return
1575
1576                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1577                                 for id in video_ids:
1578                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1579                                 return
1580
1581                         pagenum = pagenum + 1
1582
1583
1584 class YoutubePlaylistIE(InfoExtractor):
1585         """Information Extractor for YouTube playlists."""
1586
1587         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1588         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1589         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
1590         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1591         _youtube_ie = None
1592         IE_NAME = u'youtube:playlist'
1593
1594         def __init__(self, youtube_ie, downloader=None):
1595                 InfoExtractor.__init__(self, downloader)
1596                 self._youtube_ie = youtube_ie
1597
1598         def report_download_page(self, playlist_id, pagenum):
1599                 """Report attempt to download playlist page with given number."""
1600                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1601
1602         def _real_initialize(self):
1603                 self._youtube_ie.initialize()
1604
1605         def _real_extract(self, url):
1606                 # Extract playlist id
1607                 mobj = re.match(self._VALID_URL, url)
1608                 if mobj is None:
1609                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1610                         return
1611
1612                 # Single video case
1613                 if mobj.group(3) is not None:
1614                         self._youtube_ie.extract(mobj.group(3))
1615                         return
1616
1617                 # Download playlist pages
1618                 # prefix is 'p' as default for playlists but there are other types that need extra care
1619                 playlist_prefix = mobj.group(1)
1620                 if playlist_prefix == 'a':
1621                         playlist_access = 'artist'
1622                 else:
1623                         playlist_prefix = 'p'
1624                         playlist_access = 'view_play_list'
1625                 playlist_id = mobj.group(2)
1626                 video_ids = []
1627                 pagenum = 1
1628
1629                 while True:
1630                         self.report_download_page(playlist_id, pagenum)
1631                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1632                         request = urllib2.Request(url)
1633                         try:
1634                                 page = urllib2.urlopen(request).read()
1635                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1636                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1637                                 return
1638
1639                         # Extract video identifiers
1640                         ids_in_page = []
1641                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1642                                 if mobj.group(1) not in ids_in_page:
1643                                         ids_in_page.append(mobj.group(1))
1644                         video_ids.extend(ids_in_page)
1645
1646                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1647                                 break
1648                         pagenum = pagenum + 1
1649
1650                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1651                 playlistend = self._downloader.params.get('playlistend', -1)
1652                 if playlistend == -1:
1653                         video_ids = video_ids[playliststart:]
1654                 else:
1655                         video_ids = video_ids[playliststart:playlistend]
1656
1657                 for id in video_ids:
1658                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1659                 return
1660
1661
1662 class YoutubeUserIE(InfoExtractor):
1663         """Information Extractor for YouTube users."""
1664
1665         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1666         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1667         _GDATA_PAGE_SIZE = 50
1668         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1669         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1670         _youtube_ie = None
1671         IE_NAME = u'youtube:user'
1672
1673         def __init__(self, youtube_ie, downloader=None):
1674                 InfoExtractor.__init__(self, downloader)
1675                 self._youtube_ie = youtube_ie
1676
1677         def report_download_page(self, username, start_index):
1678                 """Report attempt to download user page."""
1679                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1680                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1681
1682         def _real_initialize(self):
1683                 self._youtube_ie.initialize()
1684
1685         def _real_extract(self, url):
1686                 # Extract username
1687                 mobj = re.match(self._VALID_URL, url)
1688                 if mobj is None:
1689                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1690                         return
1691
1692                 username = mobj.group(1)
1693
1694                 # Download video ids using YouTube Data API. Result size per
1695                 # query is limited (currently to 50 videos) so we need to query
1696                 # page by page until there are no video ids - it means we got
1697                 # all of them.
1698
1699                 video_ids = []
1700                 pagenum = 0
1701
1702                 while True:
1703                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1704                         self.report_download_page(username, start_index)
1705
1706                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1707
1708                         try:
1709                                 page = urllib2.urlopen(request).read()
1710                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1711                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1712                                 return
1713
1714                         # Extract video identifiers
1715                         ids_in_page = []
1716
1717                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1718                                 if mobj.group(1) not in ids_in_page:
1719                                         ids_in_page.append(mobj.group(1))
1720
1721                         video_ids.extend(ids_in_page)
1722
1723                         # A little optimization - if current page is not
1724                         # "full", ie. does not contain PAGE_SIZE video ids then
1725                         # we can assume that this page is the last one - there
1726                         # are no more ids on further pages - no need to query
1727                         # again.
1728
1729                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1730                                 break
1731
1732                         pagenum += 1
1733
1734                 all_ids_count = len(video_ids)
1735                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1736                 playlistend = self._downloader.params.get('playlistend', -1)
1737
1738                 if playlistend == -1:
1739                         video_ids = video_ids[playliststart:]
1740                 else:
1741                         video_ids = video_ids[playliststart:playlistend]
1742
1743                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1744                                 (username, all_ids_count, len(video_ids)))
1745
1746                 for video_id in video_ids:
1747                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
1748
1749
1750 class DepositFilesIE(InfoExtractor):
1751         """Information extractor for depositfiles.com"""
1752
1753         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1754         IE_NAME = u'DepositFiles'
1755
1756         def __init__(self, downloader=None):
1757                 InfoExtractor.__init__(self, downloader)
1758
1759         def report_download_webpage(self, file_id):
1760                 """Report webpage download."""
1761                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1762
1763         def report_extraction(self, file_id):
1764                 """Report information extraction."""
1765                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1766
1767         def _real_extract(self, url):
1768                 # At this point we have a new file
1769                 self._downloader.increment_downloads()
1770
1771                 file_id = url.split('/')[-1]
1772                 # Rebuild url in english locale
1773                 url = 'http://depositfiles.com/en/files/' + file_id
1774
1775                 # Retrieve file webpage with 'Free download' button pressed
1776                 free_download_indication = { 'gateway_result' : '1' }
1777                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1778                 try:
1779                         self.report_download_webpage(file_id)
1780                         webpage = urllib2.urlopen(request).read()
1781                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1782                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1783                         return
1784
1785                 # Search for the real file URL
1786                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1787                 if (mobj is None) or (mobj.group(1) is None):
1788                         # Try to figure out reason of the error.
1789                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1790                         if (mobj is not None) and (mobj.group(1) is not None):
1791                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1792                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1793                         else:
1794                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1795                         return
1796
1797                 file_url = mobj.group(1)
1798                 file_extension = os.path.splitext(file_url)[1][1:]
1799
1800                 # Search for file title
1801                 mobj = re.search(r'<b title="(.*?)">', webpage)
1802                 if mobj is None:
1803                         self._downloader.trouble(u'ERROR: unable to extract title')
1804                         return
1805                 file_title = mobj.group(1).decode('utf-8')
1806
1807                 try:
1808                         # Process file information
1809                         self._downloader.process_info({
1810                                 'id':           file_id.decode('utf-8'),
1811                                 'url':          file_url.decode('utf-8'),
1812                                 'uploader':     u'NA',
1813                                 'upload_date':  u'NA',
1814                                 'title':        file_title,
1815                                 'stitle':       file_title,
1816                                 'ext':          file_extension.decode('utf-8'),
1817                                 'format':       u'NA',
1818                                 'player_url':   None,
1819                         })
1820                 except UnavailableVideoError, err:
1821                         self._downloader.trouble(u'ERROR: unable to download file')
1822
1823
1824 class FacebookIE(InfoExtractor):
1825         """Information Extractor for Facebook"""
1826
1827         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1828         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1829         _NETRC_MACHINE = 'facebook'
1830         _available_formats = ['video', 'highqual', 'lowqual']
1831         _video_extensions = {
1832                 'video': 'mp4',
1833                 'highqual': 'mp4',
1834                 'lowqual': 'mp4',
1835         }
1836         IE_NAME = u'facebook'
1837
1838         def __init__(self, downloader=None):
1839                 InfoExtractor.__init__(self, downloader)
1840
1841         def _reporter(self, message):
1842                 """Add header and report message."""
1843                 self._downloader.to_screen(u'[facebook] %s' % message)
1844
1845         def report_login(self):
1846                 """Report attempt to log in."""
1847                 self._reporter(u'Logging in')
1848
1849         def report_video_webpage_download(self, video_id):
1850                 """Report attempt to download video webpage."""
1851                 self._reporter(u'%s: Downloading video webpage' % video_id)
1852
1853         def report_information_extraction(self, video_id):
1854                 """Report attempt to extract video information."""
1855                 self._reporter(u'%s: Extracting video information' % video_id)
1856
1857         def _parse_page(self, video_webpage):
1858                 """Extract video information from page"""
1859                 # General data
1860                 data = {'title': r'\("video_title", "(.*?)"\)',
1861                         'description': r'<div class="datawrap">(.*?)</div>',
1862                         'owner': r'\("video_owner_name", "(.*?)"\)',
1863                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1864                         }
1865                 video_info = {}
1866                 for piece in data.keys():
1867                         mobj = re.search(data[piece], video_webpage)
1868                         if mobj is not None:
1869                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1870
1871                 # Video urls
1872                 video_urls = {}
1873                 for fmt in self._available_formats:
1874                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1875                         if mobj is not None:
1876                                 # URL is in a Javascript segment inside an escaped Unicode format within
1877                                 # the generally utf-8 page
1878                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1879                 video_info['video_urls'] = video_urls
1880
1881                 return video_info
1882
1883         def _real_initialize(self):
1884                 if self._downloader is None:
1885                         return
1886
1887                 useremail = None
1888                 password = None
1889                 downloader_params = self._downloader.params
1890
1891                 # Attempt to use provided username and password or .netrc data
1892                 if downloader_params.get('username', None) is not None:
1893                         useremail = downloader_params['username']
1894                         password = downloader_params['password']
1895                 elif downloader_params.get('usenetrc', False):
1896                         try:
1897                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1898                                 if info is not None:
1899                                         useremail = info[0]
1900                                         password = info[2]
1901                                 else:
1902                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1903                         except (IOError, netrc.NetrcParseError), err:
1904                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1905                                 return
1906
1907                 if useremail is None:
1908                         return
1909
1910                 # Log in
1911                 login_form = {
1912                         'email': useremail,
1913                         'pass': password,
1914                         'login': 'Log+In'
1915                         }
1916                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1917                 try:
1918                         self.report_login()
1919                         login_results = urllib2.urlopen(request).read()
1920                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1921                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1922                                 return
1923                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1924                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1925                         return
1926
1927         def _real_extract(self, url):
1928                 mobj = re.match(self._VALID_URL, url)
1929                 if mobj is None:
1930                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1931                         return
1932                 video_id = mobj.group('ID')
1933
1934                 # Get video webpage
1935                 self.report_video_webpage_download(video_id)
1936                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1937                 try:
1938                         page = urllib2.urlopen(request)
1939                         video_webpage = page.read()
1940                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1941                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1942                         return
1943
1944                 # Start extracting information
1945                 self.report_information_extraction(video_id)
1946
1947                 # Extract information
1948                 video_info = self._parse_page(video_webpage)
1949
1950                 # uploader
1951                 if 'owner' not in video_info:
1952                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1953                         return
1954                 video_uploader = video_info['owner']
1955
1956                 # title
1957                 if 'title' not in video_info:
1958                         self._downloader.trouble(u'ERROR: unable to extract video title')
1959                         return
1960                 video_title = video_info['title']
1961                 video_title = video_title.decode('utf-8')
1962                 video_title = sanitize_title(video_title)
1963
1964                 simple_title = simplify_title(video_title)
1965
1966                 # thumbnail image
1967                 if 'thumbnail' not in video_info:
1968                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1969                         video_thumbnail = ''
1970                 else:
1971                         video_thumbnail = video_info['thumbnail']
1972
1973                 # upload date
1974                 upload_date = u'NA'
1975                 if 'upload_date' in video_info:
1976                         upload_time = video_info['upload_date']
1977                         timetuple = email.utils.parsedate_tz(upload_time)
1978                         if timetuple is not None:
1979                                 try:
1980                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1981                                 except:
1982                                         pass
1983
1984                 # description
1985                 video_description = video_info.get('description', 'No description available.')
1986
1987                 url_map = video_info['video_urls']
1988                 if len(url_map.keys()) > 0:
1989                         # Decide which formats to download
1990                         req_format = self._downloader.params.get('format', None)
1991                         format_limit = self._downloader.params.get('format_limit', None)
1992
1993                         if format_limit is not None and format_limit in self._available_formats:
1994                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1995                         else:
1996                                 format_list = self._available_formats
1997                         existing_formats = [x for x in format_list if x in url_map]
1998                         if len(existing_formats) == 0:
1999                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2000                                 return
2001                         if req_format is None:
2002                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2003                         elif req_format == 'worst':
2004                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2005                         elif req_format == '-1':
2006                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2007                         else:
2008                                 # Specific format
2009                                 if req_format not in url_map:
2010                                         self._downloader.trouble(u'ERROR: requested format not available')
2011                                         return
2012                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2013
2014                 for format_param, video_real_url in video_url_list:
2015
2016                         # At this point we have a new video
2017                         self._downloader.increment_downloads()
2018
2019                         # Extension
2020                         video_extension = self._video_extensions.get(format_param, 'mp4')
2021
2022                         try:
2023                                 # Process video information
2024                                 self._downloader.process_info({
2025                                         'id':           video_id.decode('utf-8'),
2026                                         'url':          video_real_url.decode('utf-8'),
2027                                         'uploader':     video_uploader.decode('utf-8'),
2028                                         'upload_date':  upload_date,
2029                                         'title':        video_title,
2030                                         'stitle':       simple_title,
2031                                         'ext':          video_extension.decode('utf-8'),
2032                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2033                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2034                                         'description':  video_description.decode('utf-8'),
2035                                         'player_url':   None,
2036                                 })
2037                         except UnavailableVideoError, err:
2038                                 self._downloader.trouble(u'\nERROR: unable to download video')
2039
2040 class BlipTVIE(InfoExtractor):
2041         """Information extractor for blip.tv"""
2042
2043         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2044         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2045         IE_NAME = u'blip.tv'
2046
2047         def report_extraction(self, file_id):
2048                 """Report information extraction."""
2049                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2050
2051         def report_direct_download(self, title):
2052                 """Report information extraction."""
2053                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2054
2055         def _real_extract(self, url):
2056                 mobj = re.match(self._VALID_URL, url)
2057                 if mobj is None:
2058                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2059                         return
2060
2061                 if '?' in url:
2062                         cchar = '&'
2063                 else:
2064                         cchar = '?'
2065                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2066                 request = urllib2.Request(json_url)
2067                 self.report_extraction(mobj.group(1))
2068                 info = None
2069                 try:
2070                         urlh = urllib2.urlopen(request)
2071                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2072                                 basename = url.split('/')[-1]
2073                                 title,ext = os.path.splitext(basename)
2074                                 title = title.decode('UTF-8')
2075                                 ext = ext.replace('.', '')
2076                                 self.report_direct_download(title)
2077                                 info = {
2078                                         'id': title,
2079                                         'url': url,
2080                                         'title': title,
2081                                         'stitle': simplify_title(title),
2082                                         'ext': ext,
2083                                         'urlhandle': urlh
2084                                 }
2085                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2086                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2087                         return
2088                 if info is None: # Regular URL
2089                         try:
2090                                 json_code = urlh.read()
2091                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2092                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2093                                 return
2094
2095                         try:
2096                                 json_data = json.loads(json_code)
2097                                 if 'Post' in json_data:
2098                                         data = json_data['Post']
2099                                 else:
2100                                         data = json_data
2101
2102                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2103                                 video_url = data['media']['url']
2104                                 umobj = re.match(self._URL_EXT, video_url)
2105                                 if umobj is None:
2106                                         raise ValueError('Can not determine filename extension')
2107                                 ext = umobj.group(1)
2108
2109                                 info = {
2110                                         'id': data['item_id'],
2111                                         'url': video_url,
2112                                         'uploader': data['display_name'],
2113                                         'upload_date': upload_date,
2114                                         'title': data['title'],
2115                                         'stitle': simplify_title(data['title']),
2116                                         'ext': ext,
2117                                         'format': data['media']['mimeType'],
2118                                         'thumbnail': data['thumbnailUrl'],
2119                                         'description': data['description'],
2120                                         'player_url': data['embedUrl']
2121                                 }
2122                         except (ValueError,KeyError), err:
2123                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2124                                 return
2125
2126                 self._downloader.increment_downloads()
2127
2128                 try:
2129                         self._downloader.process_info(info)
2130                 except UnavailableVideoError, err:
2131                         self._downloader.trouble(u'\nERROR: unable to download video')
2132
2133
2134 class MyVideoIE(InfoExtractor):
2135         """Information Extractor for myvideo.de."""
2136
2137         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2138         IE_NAME = u'myvideo'
2139
2140         def __init__(self, downloader=None):
2141                 InfoExtractor.__init__(self, downloader)
2142
2143         def report_download_webpage(self, video_id):
2144                 """Report webpage download."""
2145                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2146
2147         def report_extraction(self, video_id):
2148                 """Report information extraction."""
2149                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2150
2151         def _real_extract(self,url):
2152                 mobj = re.match(self._VALID_URL, url)
2153                 if mobj is None:
2154                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2155                         return
2156
2157                 video_id = mobj.group(1)
2158
2159                 # Get video webpage
2160                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2161                 try:
2162                         self.report_download_webpage(video_id)
2163                         webpage = urllib2.urlopen(request).read()
2164                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2165                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2166                         return
2167
2168                 self.report_extraction(video_id)
2169                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2170                                  webpage)
2171                 if mobj is None:
2172                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2173                         return
2174                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2175
2176                 mobj = re.search('<title>([^<]+)</title>', webpage)
2177                 if mobj is None:
2178                         self._downloader.trouble(u'ERROR: unable to extract title')
2179                         return
2180
2181                 video_title = mobj.group(1)
2182                 video_title = sanitize_title(video_title)
2183
2184                 simple_title = simplify_title(video_title)
2185
2186                 try:
2187                         self._downloader.process_info({
2188                                 'id':           video_id,
2189                                 'url':          video_url,
2190                                 'uploader':     u'NA',
2191                                 'upload_date':  u'NA',
2192                                 'title':        video_title,
2193                                 'stitle':       simple_title,
2194                                 'ext':          u'flv',
2195                                 'format':       u'NA',
2196                                 'player_url':   None,
2197                         })
2198                 except UnavailableVideoError:
2199                         self._downloader.trouble(u'\nERROR: Unable to download video')
2200
2201 class ComedyCentralIE(InfoExtractor):
2202         """Information extractor for The Daily Show and Colbert Report """
2203
2204         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2205         IE_NAME = u'comedycentral'
2206
2207         def report_extraction(self, episode_id):
2208                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2209
2210         def report_config_download(self, episode_id):
2211                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2212
2213         def report_index_download(self, episode_id):
2214                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2215
2216         def report_player_url(self, episode_id):
2217                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2218
2219         def _real_extract(self, url):
2220                 mobj = re.match(self._VALID_URL, url)
2221                 if mobj is None:
2222                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2223                         return
2224
2225                 if mobj.group('shortname'):
2226                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2227                                 url = u'http://www.thedailyshow.com/full-episodes/'
2228                         else:
2229                                 url = u'http://www.colbertnation.com/full-episodes/'
2230                         mobj = re.match(self._VALID_URL, url)
2231                         assert mobj is not None
2232
2233                 dlNewest = not mobj.group('episode')
2234                 if dlNewest:
2235                         epTitle = mobj.group('showname')
2236                 else:
2237                         epTitle = mobj.group('episode')
2238
2239                 req = urllib2.Request(url)
2240                 self.report_extraction(epTitle)
2241                 try:
2242                         htmlHandle = urllib2.urlopen(req)
2243                         html = htmlHandle.read()
2244                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2245                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2246                         return
2247                 if dlNewest:
2248                         url = htmlHandle.geturl()
2249                         mobj = re.match(self._VALID_URL, url)
2250                         if mobj is None:
2251                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2252                                 return
2253                         if mobj.group('episode') == '':
2254                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2255                                 return
2256                         epTitle = mobj.group('episode')
2257
2258                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2259                 if len(mMovieParams) == 0:
2260                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2261                         return
2262
2263                 playerUrl_raw = mMovieParams[0][0]
2264                 self.report_player_url(epTitle)
2265                 try:
2266                         urlHandle = urllib2.urlopen(playerUrl_raw)
2267                         playerUrl = urlHandle.geturl()
2268                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2269                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2270                         return
2271
2272                 uri = mMovieParams[0][1]
2273                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2274                 self.report_index_download(epTitle)
2275                 try:
2276                         indexXml = urllib2.urlopen(indexUrl).read()
2277                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2278                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2279                         return
2280
2281                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2282                 itemEls = idoc.findall('.//item')
2283                 for itemEl in itemEls:
2284                         mediaId = itemEl.findall('./guid')[0].text
2285                         shortMediaId = mediaId.split(':')[-1]
2286                         showId = mediaId.split(':')[-2].replace('.com', '')
2287                         officialTitle = itemEl.findall('./title')[0].text
2288                         officialDate = itemEl.findall('./pubDate')[0].text
2289
2290                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2291                                                 urllib.urlencode({'uri': mediaId}))
2292                         configReq = urllib2.Request(configUrl)
2293                         self.report_config_download(epTitle)
2294                         try:
2295                                 configXml = urllib2.urlopen(configReq).read()
2296                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2297                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2298                                 return
2299
2300                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2301                         turls = []
2302                         for rendition in cdoc.findall('.//rendition'):
2303                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2304                                 turls.append(finfo)
2305
2306                         if len(turls) == 0:
2307                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2308                                 continue
2309
2310                         # For now, just pick the highest bitrate
2311                         format,video_url = turls[-1]
2312
2313                         self._downloader.increment_downloads()
2314
2315                         effTitle = showId + u'-' + epTitle
2316                         info = {
2317                                 'id': shortMediaId,
2318                                 'url': video_url,
2319                                 'uploader': showId,
2320                                 'upload_date': officialDate,
2321                                 'title': effTitle,
2322                                 'stitle': simplify_title(effTitle),
2323                                 'ext': 'mp4',
2324                                 'format': format,
2325                                 'thumbnail': None,
2326                                 'description': officialTitle,
2327                                 'player_url': playerUrl
2328                         }
2329
2330                         try:
2331                                 self._downloader.process_info(info)
2332                         except UnavailableVideoError, err:
2333                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
2334                                 continue
2335
2336
2337 class EscapistIE(InfoExtractor):
2338         """Information extractor for The Escapist """
2339
2340         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2341         IE_NAME = u'escapist'
2342
2343         def report_extraction(self, showName):
2344                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2345
2346         def report_config_download(self, showName):
2347                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2348
2349         def _real_extract(self, url):
2350                 htmlParser = HTMLParser.HTMLParser()
2351
2352                 mobj = re.match(self._VALID_URL, url)
2353                 if mobj is None:
2354                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2355                         return
2356                 showName = mobj.group('showname')
2357                 videoId = mobj.group('episode')
2358
2359                 self.report_extraction(showName)
2360                 try:
2361                         webPage = urllib2.urlopen(url).read()
2362                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2363                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2364                         return
2365
2366                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2367                 description = htmlParser.unescape(descMatch.group(1))
2368                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2369                 imgUrl = htmlParser.unescape(imgMatch.group(1))
2370                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2371                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
2372                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2373                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2374
2375                 self.report_config_download(showName)
2376                 try:
2377                         configJSON = urllib2.urlopen(configUrl).read()
2378                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2379                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2380                         return
2381
2382                 # Technically, it's JavaScript, not JSON
2383                 configJSON = configJSON.replace("'", '"')
2384
2385                 try:
2386                         config = json.loads(configJSON)
2387                 except (ValueError,), err:
2388                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2389                         return
2390
2391                 playlist = config['playlist']
2392                 videoUrl = playlist[1]['url']
2393
2394                 self._downloader.increment_downloads()
2395                 info = {
2396                         'id': videoId,
2397                         'url': videoUrl,
2398                         'uploader': showName,
2399                         'upload_date': None,
2400                         'title': showName,
2401                         'stitle': simplify_title(showName),
2402                         'ext': 'flv',
2403                         'format': 'flv',
2404                         'thumbnail': imgUrl,
2405                         'description': description,
2406                         'player_url': playerUrl,
2407                 }
2408
2409                 try:
2410                         self._downloader.process_info(info)
2411                 except UnavailableVideoError, err:
2412                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
2413
2414
2415 class CollegeHumorIE(InfoExtractor):
2416         """Information extractor for collegehumor.com"""
2417
2418         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2419         IE_NAME = u'collegehumor'
2420
2421         def report_webpage(self, video_id):
2422                 """Report information extraction."""
2423                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2424
2425         def report_extraction(self, video_id):
2426                 """Report information extraction."""
2427                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2428
2429         def _real_extract(self, url):
2430                 htmlParser = HTMLParser.HTMLParser()
2431
2432                 mobj = re.match(self._VALID_URL, url)
2433                 if mobj is None:
2434                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2435                         return
2436                 video_id = mobj.group('videoid')
2437
2438                 self.report_webpage(video_id)
2439                 request = urllib2.Request(url)
2440                 try:
2441                         webpage = urllib2.urlopen(request).read()
2442                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2443                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2444                         return
2445
2446                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2447                 if m is None:
2448                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2449                         return
2450                 internal_video_id = m.group('internalvideoid')
2451
2452                 info = {
2453                         'id': video_id,
2454                         'internal_id': internal_video_id,
2455                 }
2456
2457                 self.report_extraction(video_id)
2458                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2459                 try:
2460                         metaXml = urllib2.urlopen(xmlUrl).read()
2461                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2462                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2463                         return
2464
2465                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2466                 try:
2467                         videoNode = mdoc.findall('./video')[0]
2468                         info['description'] = videoNode.findall('./description')[0].text
2469                         info['title'] = videoNode.findall('./caption')[0].text
2470                         info['stitle'] = simplify_title(info['title'])
2471                         info['url'] = videoNode.findall('./file')[0].text
2472                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2473                         info['ext'] = info['url'].rpartition('.')[2]
2474                         info['format'] = info['ext']
2475                 except IndexError:
2476                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2477                         return
2478
2479                 self._downloader.increment_downloads()
2480
2481                 try:
2482                         self._downloader.process_info(info)
2483                 except UnavailableVideoError, err:
2484                         self._downloader.trouble(u'\nERROR: unable to download video')
2485
2486
2487 class XVideosIE(InfoExtractor):
2488         """Information extractor for xvideos.com"""
2489
2490         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2491         IE_NAME = u'xvideos'
2492
2493         def report_webpage(self, video_id):
2494                 """Report information extraction."""
2495                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2496
2497         def report_extraction(self, video_id):
2498                 """Report information extraction."""
2499                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2500
2501         def _real_extract(self, url):
2502                 htmlParser = HTMLParser.HTMLParser()
2503
2504                 mobj = re.match(self._VALID_URL, url)
2505                 if mobj is None:
2506                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2507                         return
2508                 video_id = mobj.group(1).decode('utf-8')
2509
2510                 self.report_webpage(video_id)
2511
2512                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2513                 try:
2514                         webpage = urllib2.urlopen(request).read()
2515                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2516                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2517                         return
2518
2519                 self.report_extraction(video_id)
2520
2521
2522                 # Extract video URL
2523                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2524                 if mobj is None:
2525                         self._downloader.trouble(u'ERROR: unable to extract video url')
2526                         return
2527                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2528
2529
2530                 # Extract title
2531                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2532                 if mobj is None:
2533                         self._downloader.trouble(u'ERROR: unable to extract video title')
2534                         return
2535                 video_title = mobj.group(1).decode('utf-8')
2536
2537
2538                 # Extract video thumbnail
2539                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2540                 if mobj is None:
2541                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2542                         return
2543                 video_thumbnail = mobj.group(1).decode('utf-8')
2544
2545
2546
2547                 self._downloader.increment_downloads()
2548                 info = {
2549                         'id': video_id,
2550                         'url': video_url,
2551                         'uploader': None,
2552                         'upload_date': None,
2553                         'title': video_title,
2554                         'stitle': simplify_title(video_title),
2555                         'ext': 'flv',
2556                         'format': 'flv',
2557                         'thumbnail': video_thumbnail,
2558                         'description': None,
2559                         'player_url': None,
2560                 }
2561
2562                 try:
2563                         self._downloader.process_info(info)
2564                 except UnavailableVideoError, err:
2565                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
2566
2567
2568 class SoundcloudIE(InfoExtractor):
2569         """Information extractor for soundcloud.com
2570            To access the media, the uid of the song and a stream token
2571            must be extracted from the page source and the script must make
2572            a request to media.soundcloud.com/crossdomain.xml. Then
2573            the media can be grabbed by requesting from an url composed
2574            of the stream token and uid
2575          """
2576
2577         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2578         IE_NAME = u'soundcloud'
2579
2580         def __init__(self, downloader=None):
2581                 InfoExtractor.__init__(self, downloader)
2582
2583         def report_webpage(self, video_id):
2584                 """Report information extraction."""
2585                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2586
2587         def report_extraction(self, video_id):
2588                 """Report information extraction."""
2589                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2590
2591         def _real_extract(self, url):
2592                 htmlParser = HTMLParser.HTMLParser()
2593
2594                 mobj = re.match(self._VALID_URL, url)
2595                 if mobj is None:
2596                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2597                         return
2598
2599                 # extract uploader (which is in the url)
2600                 uploader = mobj.group(1).decode('utf-8')
2601                 # extract simple title (uploader + slug of song title)
2602                 slug_title =  mobj.group(2).decode('utf-8')
2603                 simple_title = uploader + '-' + slug_title
2604
2605                 self.report_webpage('%s/%s' % (uploader, slug_title))
2606
2607                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2608                 try:
2609                         webpage = urllib2.urlopen(request).read()
2610                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2611                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2612                         return
2613
2614                 self.report_extraction('%s/%s' % (uploader, slug_title))
2615
2616                 # extract uid and stream token that soundcloud hands out for access
2617                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2618                 if mobj:
2619                         video_id = mobj.group(1)
2620                         stream_token = mobj.group(2)
2621
2622                 # extract unsimplified title
2623                 mobj = re.search('"title":"(.*?)",', webpage)
2624                 if mobj:
2625                         title = mobj.group(1)
2626
2627                 # construct media url (with uid/token)
2628                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2629                 mediaURL = mediaURL % (video_id, stream_token)
2630
2631                 # description
2632                 description = u'No description available'
2633                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2634                 if mobj:
2635                         description = mobj.group(1)
2636
2637                 # upload date
2638                 upload_date = None
2639                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2640                 if mobj:
2641                         try:
2642                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2643                         except Exception, e:
2644                                 print str(e)
2645
2646                 # for soundcloud, a request to a cross domain is required for cookies
2647                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2648
2649                 try:
2650                         self._downloader.process_info({
2651                                 'id':           video_id.decode('utf-8'),
2652                                 'url':          mediaURL,
2653                                 'uploader':     uploader.decode('utf-8'),
2654                                 'upload_date':  upload_date,
2655                                 'title':        simple_title.decode('utf-8'),
2656                                 'stitle':       simple_title.decode('utf-8'),
2657                                 'ext':          u'mp3',
2658                                 'format':       u'NA',
2659                                 'player_url':   None,
2660                                 'description': description.decode('utf-8')
2661                         })
2662                 except UnavailableVideoError:
2663                         self._downloader.trouble(u'\nERROR: unable to download video')
2664
2665
2666 class InfoQIE(InfoExtractor):
2667         """Information extractor for infoq.com"""
2668
2669         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2670         IE_NAME = u'infoq'
2671
2672         def report_webpage(self, video_id):
2673                 """Report information extraction."""
2674                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2675
2676         def report_extraction(self, video_id):
2677                 """Report information extraction."""
2678                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2679
2680         def _real_extract(self, url):
2681                 htmlParser = HTMLParser.HTMLParser()
2682
2683                 mobj = re.match(self._VALID_URL, url)
2684                 if mobj is None:
2685                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2686                         return
2687
2688                 self.report_webpage(url)
2689
2690                 request = urllib2.Request(url)
2691                 try:
2692                         webpage = urllib2.urlopen(request).read()
2693                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2694                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2695                         return
2696
2697                 self.report_extraction(url)
2698
2699
2700                 # Extract video URL
2701                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2702                 if mobj is None:
2703                         self._downloader.trouble(u'ERROR: unable to extract video url')
2704                         return
2705                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2706
2707
2708                 # Extract title
2709                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2710                 if mobj is None:
2711                         self._downloader.trouble(u'ERROR: unable to extract video title')
2712                         return
2713                 video_title = mobj.group(1).decode('utf-8')
2714
2715                 # Extract description
2716                 video_description = u'No description available.'
2717                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2718                 if mobj is not None:
2719                         video_description = mobj.group(1).decode('utf-8')
2720
2721                 video_filename = video_url.split('/')[-1]
2722                 video_id, extension = video_filename.split('.')
2723
2724                 self._downloader.increment_downloads()
2725                 info = {
2726                         'id': video_id,
2727                         'url': video_url,
2728                         'uploader': None,
2729                         'upload_date': None,
2730                         'title': video_title,
2731                         'stitle': simplify_title(video_title),
2732                         'ext': extension,
2733                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2734                         'thumbnail': None,
2735                         'description': video_description,
2736                         'player_url': None,
2737                 }
2738
2739                 try:
2740                         self._downloader.process_info(info)
2741                 except UnavailableVideoError, err:
2742                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
2743
2744 class MixcloudIE(InfoExtractor):
2745         """Information extractor for www.mixcloud.com"""
2746         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2747         IE_NAME = u'mixcloud'
2748
2749         def __init__(self, downloader=None):
2750                 InfoExtractor.__init__(self, downloader)
2751
2752         def report_download_json(self, file_id):
2753                 """Report JSON download."""
2754                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2755
2756         def report_extraction(self, file_id):
2757                 """Report information extraction."""
2758                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2759
2760         def get_urls(self, jsonData, fmt, bitrate='best'):
2761                 """Get urls from 'audio_formats' section in json"""
2762                 file_url = None
2763                 try:
2764                         bitrate_list = jsonData[fmt]
2765                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2766                                 bitrate = max(bitrate_list) # select highest
2767
2768                         url_list = jsonData[fmt][bitrate]
2769                 except TypeError: # we have no bitrate info.
2770                         url_list = jsonData[fmt]
2771
2772                 return url_list
2773
2774         def check_urls(self, url_list):
2775                 """Returns 1st active url from list"""
2776                 for url in url_list:
2777                         try:
2778                                 urllib2.urlopen(url)
2779                                 return url
2780                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2781                                 url = None
2782
2783                 return None
2784
2785         def _print_formats(self, formats):
2786                 print 'Available formats:'
2787                 for fmt in formats.keys():
2788                         for b in formats[fmt]:
2789                                 try:
2790                                         ext = formats[fmt][b][0]
2791                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2792                                 except TypeError: # we have no bitrate info
2793                                         ext = formats[fmt][0]
2794                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2795                                         break
2796
2797         def _real_extract(self, url):
2798                 mobj = re.match(self._VALID_URL, url)
2799                 if mobj is None:
2800                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2801                         return
2802                 # extract uploader & filename from url
2803                 uploader = mobj.group(1).decode('utf-8')
2804                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2805
2806                 # construct API request
2807                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2808                 # retrieve .json file with links to files
2809                 request = urllib2.Request(file_url)
2810                 try:
2811                         self.report_download_json(file_url)
2812                         jsonData = urllib2.urlopen(request).read()
2813                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2814                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2815                         return
2816
2817                 # parse JSON
2818                 json_data = json.loads(jsonData)
2819                 player_url = json_data['player_swf_url']
2820                 formats = dict(json_data['audio_formats'])
2821
2822                 req_format = self._downloader.params.get('format', None)
2823                 bitrate = None
2824
2825                 if self._downloader.params.get('listformats', None):
2826                         self._print_formats(formats)
2827                         return
2828
2829                 if req_format is None or req_format == 'best':
2830                         for format_param in formats.keys():
2831                                 url_list = self.get_urls(formats, format_param)
2832                                 # check urls
2833                                 file_url = self.check_urls(url_list)
2834                                 if file_url is not None:
2835                                         break # got it!
2836                 else:
2837                         if req_format not in formats.keys():
2838                                 self._downloader.trouble(u'ERROR: format is not available')
2839                                 return
2840
2841                         url_list = self.get_urls(formats, req_format)
2842                         file_url = self.check_urls(url_list)
2843                         format_param = req_format
2844
2845                 # We have audio
2846                 self._downloader.increment_downloads()
2847                 try:
2848                         # Process file information
2849                         self._downloader.process_info({
2850                                 'id': file_id.decode('utf-8'),
2851                                 'url': file_url.decode('utf-8'),
2852                                 'uploader':     uploader.decode('utf-8'),
2853                                 'upload_date': u'NA',
2854                                 'title': json_data['name'],
2855                                 'stitle': simplify_title(json_data['name']),
2856                                 'ext': file_url.split('.')[-1].decode('utf-8'),
2857                                 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2858                                 'thumbnail': json_data['thumbnail_url'],
2859                                 'description': json_data['description'],
2860                                 'player_url': player_url.decode('utf-8'),
2861                         })
2862                 except UnavailableVideoError, err:
2863                         self._downloader.trouble(u'ERROR: unable to download file')
2864
2865 class StanfordOpenClassroomIE(InfoExtractor):
2866         """Information extractor for Stanford's Open ClassRoom"""
2867
2868         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2869         IE_NAME = u'stanfordoc'
2870
2871         def report_download_webpage(self, objid):
2872                 """Report information extraction."""
2873                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2874
2875         def report_extraction(self, video_id):
2876                 """Report information extraction."""
2877                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2878
2879         def _real_extract(self, url):
2880                 mobj = re.match(self._VALID_URL, url)
2881                 if mobj is None:
2882                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2883                         return
2884
2885                 if mobj.group('course') and mobj.group('video'): # A specific video
2886                         course = mobj.group('course')
2887                         video = mobj.group('video')
2888                         info = {
2889                                 'id': simplify_title(course + '_' + video),
2890                         }
2891
2892                         self.report_extraction(info['id'])
2893                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2894                         xmlUrl = baseUrl + video + '.xml'
2895                         try:
2896                                 metaXml = urllib2.urlopen(xmlUrl).read()
2897                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2898                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2899                                 return
2900                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2901                         try:
2902                                 info['title'] = mdoc.findall('./title')[0].text
2903                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2904                         except IndexError:
2905                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2906                                 return
2907                         info['stitle'] = simplify_title(info['title'])
2908                         info['ext'] = info['url'].rpartition('.')[2]
2909                         info['format'] = info['ext']
2910                         self._downloader.increment_downloads()
2911                         try:
2912                                 self._downloader.process_info(info)
2913                         except UnavailableVideoError, err:
2914                                 self._downloader.trouble(u'\nERROR: unable to download video')
2915                 elif mobj.group('course'): # A course page
2916                         unescapeHTML = HTMLParser.HTMLParser().unescape
2917
2918                         course = mobj.group('course')
2919                         info = {
2920                                 'id': simplify_title(course),
2921                                 'type': 'playlist',
2922                         }
2923
2924                         self.report_download_webpage(info['id'])
2925                         try:
2926                                 coursepage = urllib2.urlopen(url).read()
2927                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2928                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2929                                 return
2930
2931                         m = re.search('<h1>([^<]+)</h1>', coursepage)
2932                         if m:
2933                                 info['title'] = unescapeHTML(m.group(1))
2934                         else:
2935                                 info['title'] = info['id']
2936                         info['stitle'] = simplify_title(info['title'])
2937
2938                         m = re.search('<description>([^<]+)</description>', coursepage)
2939                         if m:
2940                                 info['description'] = unescapeHTML(m.group(1))
2941
2942                         links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2943                         info['list'] = [
2944                                 {
2945                                         'type': 'reference',
2946                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2947                                 }
2948                                         for vpage in links]
2949
2950                         for entry in info['list']:
2951                                 assert entry['type'] == 'reference'
2952                                 self.extract(entry['url'])
2953                 else: # Root page
2954                         unescapeHTML = HTMLParser.HTMLParser().unescape
2955
2956                         info = {
2957                                 'id': 'Stanford OpenClassroom',
2958                                 'type': 'playlist',
2959                         }
2960
2961                         self.report_download_webpage(info['id'])
2962                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2963                         try:
2964                                 rootpage = urllib2.urlopen(rootURL).read()
2965                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2966                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2967                                 return
2968
2969                         info['title'] = info['id']
2970                         info['stitle'] = simplify_title(info['title'])
2971
2972                         links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2973                         info['list'] = [
2974                                 {
2975                                         'type': 'reference',
2976                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2977                                 }
2978                                         for cpage in links]
2979
2980                         for entry in info['list']:
2981                                 assert entry['type'] == 'reference'
2982                                 self.extract(entry['url'])
2983
2984 class MTVIE(InfoExtractor):
2985         """Information extractor for MTV.com"""
2986
2987         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2988         IE_NAME = u'mtv'
2989
2990         def report_webpage(self, video_id):
2991                 """Report information extraction."""
2992                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2993
2994         def report_extraction(self, video_id):
2995                 """Report information extraction."""
2996                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2997
2998         def _real_extract(self, url):
2999                 mobj = re.match(self._VALID_URL, url)
3000                 if mobj is None:
3001                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3002                         return
3003                 if not mobj.group('proto'):
3004                         url = 'http://' + url
3005                 video_id = mobj.group('videoid')
3006                 self.report_webpage(video_id)
3007
3008                 request = urllib2.Request(url)
3009                 try:
3010                         webpage = urllib2.urlopen(request).read()
3011                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3012                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3013                         return
3014
3015                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3016                 if mobj is None:
3017                         self._downloader.trouble(u'ERROR: unable to extract song name')
3018                         return
3019                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3020                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3021                 if mobj is None:
3022                         self._downloader.trouble(u'ERROR: unable to extract performer')
3023                         return
3024                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3025                 video_title = performer + ' - ' + song_name
3026
3027                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3028                 if mobj is None:
3029                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3030                         return
3031                 mtvn_uri = mobj.group(1)
3032
3033                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3034                 if mobj is None:
3035                         self._downloader.trouble(u'ERROR: unable to extract content id')
3036                         return
3037                 content_id = mobj.group(1)
3038
3039                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3040                 self.report_extraction(video_id)
3041                 request = urllib2.Request(videogen_url)
3042                 try:
3043                         metadataXml = urllib2.urlopen(request).read()
3044                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3045                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3046                         return
3047
3048                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3049                 renditions = mdoc.findall('.//rendition')
3050
3051                 # For now, always pick the highest quality.
3052                 rendition = renditions[-1]
3053
3054                 try:
3055                         _,_,ext = rendition.attrib['type'].partition('/')
3056                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3057                         video_url = rendition.find('./src').text
3058                 except KeyError:
3059                         self._downloader.trouble('Invalid rendition field.')
3060                         return
3061
3062                 self._downloader.increment_downloads()
3063                 info = {
3064                         'id': video_id,
3065                         'url': video_url,
3066                         'uploader': performer,
3067                         'title': video_title,
3068                         'stitle': simplify_title(video_title),
3069                         'ext': ext,
3070                         'format': format,
3071                 }
3072
3073                 try:
3074                         self._downloader.process_info(info)
3075                 except UnavailableVideoError, err:
3076                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)