2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include

    uploader:    Nickname of the video uploader.
    stitle:      Simplified title.
    ext:         Video filename extension.
    player_url:  SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    # NOTE(review): this is a sampled excerpt — some lines of the original
    # file are elided within the bodies below; do not assume they are complete.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the `def initialize(self):` header for the next two
    # lines is elided from this excerpt.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): sampled excerpt — lines are elided throughout this class
    # (control-flow headers, returns, dict entries); bodies are not complete.

    # Group 1 matches the URL prefix (scheme/host/path up to the id);
    # group 2 is the video id. The trailing `(?(1).+)?` conditional only
    # allows extra characters when a real URL prefix was matched.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Force English UI (hl=en/gl=US) so scraped strings such as dates are stable.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/extension map (remaining entries elided in this excerpt)
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" display-size map (entries elided in this excerpt)
    _video_dimensions = {

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into SRT subtitle text."""
        # NOTE(review): initialization of the accumulator (srt = '') and the
        # float conversion of `start` are elided from this excerpt.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none is given
            end = start + float(dur)
            # Render hh:mm:ss,mmm — the SRT timestamp format.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        # NOTE(review): the `for x in formats:` header is elided from this excerpt.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set language, log in (explicit credentials or .netrc) and confirm age."""
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # Best-effort: a broken .netrc downgrades to a warning, not an error.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language (non-fatal on failure)
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # Log in (login_form dict opener elided in this excerpt)
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # If the response still contains the login form, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age (age_form dict opener elided in this excerpt)
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract metadata and direct media URL(s) for a YouTube watch page."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Strip the JavaScript backslash-escaping (e.g. "\/" -> "/").
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info
        self.report_video_info_webpage_download(video_id)
        # Try several 'el' contexts; different ones succeed for embedded,
        # age-gated or Vevo videos. Stop at the first response with a token.
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = simplify_title(video_title)

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:	# don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # Normalize '/', ',' and '-' separators to single spaces before strptime.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
                # Language preference: explicit option > English > first listed.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = srt_lang_list[0]
                if not srt_lang in srt_lang_list:
                    self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
                    request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
                        srt_xml = urllib2.urlopen(request).read()
                    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                    video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
                self._downloader.trouble(u'WARNING: video has no closed captions')

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP streams expose a single connection URL with no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at the requested limit (lists are best-first).
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # (dict opener elided in this excerpt)
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                # RTMP entries have format_param None -> report u'NA'.
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): sampled excerpt — `if`/`try`/`return` lines are elided
    # within the method bodies below.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST past the family filter."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age (disclaimer_form dict opener elided in this excerpt)
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract metadata and the media URL from a Metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate YouTube-hosted videos to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback path: pull the media URL out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            # Undo the JSON backslash-escaped slashes.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (info dict opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): sampled excerpt — `if`/`try`/`return` lines are elided
    # within the method bodies below.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract metadata and the SD media URL from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Opt out of the family filter so restricted pages are still served.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (info dict opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # NOTE(review): sampled excerpt — `if`/`try`/`return` lines are elided
    # within the method bodies below.

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract metadata and the media URL from a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            # No direct download URL — fall back to the FLV stream URL.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Decode the JS hex escapes ('=' and '&').
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail is only on the search results page, so a second
            # request is made only when the user explicitly asks for it.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:	# we need something to pass to process_info

        # (info dict opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): sampled excerpt — `if`/`try`/`return` lines are elided
    # within the method bodies below.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract metadata and the FLV media URL from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # Title regex captures group(1)=title and group(2)=uploader.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        # (info dict opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): sampled excerpt — `if`/`try`/`return` lines are elided
    # within the method bodies below.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract metadata and the media URL from a Yahoo! Video page.

        new_video guards the one-shot URL rewrite below against recursing
        more than once.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Re-enter with the canonical /watch/ URL; new_video=False
            # prevents a second rewrite pass.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = simplify_title(video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the literal 'people'/'profile' alternation;
        # the uploader name is group(2) — this looks like a bug to confirm.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # (info dict opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            # NOTE(review): 'thumbnail' key is duplicated in this dict; this
            # second, non-decoded value overwrites the decoded one above.
            'thumbnail': video_thumbnail,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Downloads the clip page, pulls the embedded config JSON out of the
    page source, and builds a play_redirect URL from the request
    signature/timestamp plus the best available codec.
    """

    # _VALID_URL matches Vimeo URLs; group 1 is the numeric clip id
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # error path: reached when the URL does not match _VALID_URL
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON by string-splitting the page source.
        # NOTE(review): brittle — breaks if Vimeo changes the surrounding markup.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # error path: config JSON could not be located/parsed
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]
        simple_title = simplify_title(video_title)

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description (HTML element with id "description")
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (optional; defaults to NA when not present)
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # TODO bind to format param
        # Preference order: h264/mp4, then vp8/flv, then vp6/flv.
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        for codec in codecs:
            if codec[0] in config["video"]["files"]:
                video_codec = codec[0]
                video_extension = codec[1]
                if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
                else: quality = 'sd'
        # error path: none of the known codecs was present in the config
        self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                %(video_id, sig, timestamp, quality, video_codec.upper())

        # Fields of the info dictionary handed to the FileDownloader
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Used when no site-specific extractor matched: follows URL-shortener
    redirects, then scrapes the page for a JW-Player-style file= URL.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        # HEAD-only request so redirect targets are discovered without
        # downloading response bodies.
        class HeadRequest(urllib2.Request):
            def get_method(self):
        class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # percent-encode spaces; strip body-related headers
                    # since a HEAD request carries no body
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    # non-redirect status codes propagate as HTTPError
                    raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
        class HTTPMethodFallback(urllib2.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(urllib2.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener with only the handlers needed for the probe
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect happened
        if url == new_url: return False

        # Restart the extraction chain with the redirect target
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])

    def _real_extract(self, url):
        if self._test_redirect(url): return

        # Use the last path component as a provisional video id
        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # NOTE(review): message says 'title' but this branch fails on the
        # uploader/domain extraction — likely a copy-paste error in the text
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # Fields of the info dictionary handed to the FileDownloader
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts pseudo-URLs of the form ytsearch[N|all]:QUERY, resolves them
    through the GData API, and queues the resulting watch URLs for
    download.
    """

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # %s = url-quoted query, %i = 1-based start index; page size fixed at 50
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # error path: query did not match the ytsearch syntax
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix -> single result; 'all' -> cap; otherwise numeric count
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
                # error path: numeric prefix was zero/negative
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Page through the API 50 ids at a time until the limit is reached
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
            request = urllib2.Request(result_url)
            data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # never ask for more than the API reports as available
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts gvsearch[N|all]:QUERY pseudo-URLs; scrapes the HTML result
    pages for docids and queues the corresponding videoplay URLs.
    """

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    # %s = url-quoted query, %s = 0-based result offset (pagenum * 10)
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # presence of the "next" pagination link means more pages exist
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # error path: query did not match the gvsearch syntax
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix -> single result; 'all' -> cap; otherwise numeric count
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
                # error path: numeric prefix was zero/negative
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers (deduplicated, order-preserving)
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # No "next page" link -> queue what we have and stop
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts yvsearch[N|all]:QUERY pseudo-URLs; scrapes result pages for
    watch ids and queues the corresponding watch URLs.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    # %s = url-quoted query, %s = result offset
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # error path: query did not match the yvsearch syntax
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix -> single result; 'all' -> cap; otherwise numeric count
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
                # error path: numeric prefix was zero/negative
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # track ids across pages so repeats on later pages are ignored
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # No "Next" pagination link -> queue what we have and stop
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Pages through a playlist/artist/course listing, collects watch ids,
    applies the user's playliststart/playlistend window, and queues each
    watch URL for download.
    """

    # group 1: listing type char (p/a/list...); group 2: playlist id;
    # group 3 (optional): a direct video id embedded in the URL
    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    # filled in with the playlist id before scanning each page
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        # error path: URL did not match _VALID_URL
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Single video case: hand the embedded video id straight back
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = urllib2.Request(url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers (deduplicated within the page)
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # stop paging once the "Next" link disappears
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
        pagenum = pagenum + 1

        # Apply the user's playlist window (1-based start, inclusive end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Fetches a user's uploads via the GData feed, page by page, then
    queues each collected watch URL (subject to the playliststart /
    playlistend window).
    """

    # matches youtube.com/user/NAME URLs and the ytuser:NAME shorthand
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps results per request, hence the paging below
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # error path: URL did not match _VALID_URL
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        # GData start-index is 1-based
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers (deduplicated within the page)
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply the user's playlist window (1-based start, inclusive end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Rebuilds the URL under the English locale, POSTs the "Free download"
    form, then scrapes the real fileshare URL and title from the page.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates the button click)
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # collapse whitespace in the site's restriction notice
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # error path: title element not found
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Fields of the info dictionary handed to the FileDownloader
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': u'NA',
        'title': file_title,
        'stitle': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in with credentials from the command line or .netrc,
    downloads the video page, and parses JS-embedded metadata ("video_title",
    "video_owner_name", format-specific "_src" URLs, ...) to build the info
    dictionary. One info dict is produced per requested format.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # machine name used to look up credentials in ~/.netrc
    _NETRC_MACHINE = 'facebook'
    # format preference order: best first
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General metadata: regex per field, applied over the raw page
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # values are JS-escaped and url-quoted inside the page
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls: one "<fmt>_src" entry per available format
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log in to Facebook, if credentials were supplied."""
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are non-fatal: continue without login
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        if useremail is None:

        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # a login <form> in the response means authentication failed
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # error path: URL did not match _VALID_URL
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = simplify_title(video_title)

        # thumbnail image: missing thumbnail is a warning, not fatal
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            video_thumbnail = video_info['thumbnail']

        # upload date (RFC 2822 date parsed into YYYYMMDD)
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific format requested
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # one info dict per selected format
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # Fields of the info dictionary handed to the FileDownloader
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Requests the blip.tv JSON API for the page; if the server answers
    with a video/* Content-Type instead, treats it as a direct download.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # extracts the filename extension from a media URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # error path: URL did not match _VALID_URL
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask the site for its JSON representation of the page
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        urlh = urllib2.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # minimal info dict for the direct-download case
            'stitle': simplify_title(title),
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if info is None: # Regular URL
            json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))

            json_data = json.loads(json_code)
            # the payload lives under 'Post' (or at top level on some pages)
            if 'Post' in json_data:
                data = json_data['Post']

            # convert blip's "mm-dd-yy HH:MM(am|pm)" datestamp into YYYYMMDD
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # error path: media URL carries no recognizable extension
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # Fields of the info dictionary handed to the FileDownloader
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'stitle': simplify_title(data['title']),
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl']
        except (ValueError,KeyError), err:
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2008 class MyVideoIE(InfoExtractor):
2009 """Information Extractor for myvideo.de."""
2011 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2012 IE_NAME = u'myvideo'
2014 def __init__(self, downloader=None):
2015 InfoExtractor.__init__(self, downloader)
2017 def report_download_webpage(self, video_id):
2018 """Report webpage download."""
2019 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2021 def report_extraction(self, video_id):
2022 """Report information extraction."""
2023 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2025 def _real_extract(self,url):
2026 mobj = re.match(self._VALID_URL, url)
2028 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2031 video_id = mobj.group(1)
2034 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2036 self.report_download_webpage(video_id)
2037 webpage = urllib2.urlopen(request).read()
2038 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2039 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2042 self.report_extraction(video_id)
2043 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2046 self._downloader.trouble(u'ERROR: unable to extract media URL')
2048 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2050 mobj = re.search('<title>([^<]+)</title>', webpage)
2052 self._downloader.trouble(u'ERROR: unable to extract title')
2055 video_title = mobj.group(1)
2056 video_title = sanitize_title(video_title)
2058 simple_title = simplify_title(video_title)
2064 'upload_date': u'NA',
2065 'title': video_title,
2066 'stitle': simple_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts short names (':tds', ':colbert', ...) as well as full-episode URLs.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report per-media configuration download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Resolve short names / episode URLs, then walk the MRSS feed for media."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Short names map to the show's "latest full episode" page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No explicit episode means we must follow redirects to the newest one.
        dlNewest = not mobj.group('episode')
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = urllib2.urlopen(req)
        html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

        # Re-parse the post-redirect URL to learn which episode we landed on.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        # The Flash embed URL doubles as the mtvnservices media URI.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

        # Resolve the player URL through its redirects.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        urlHandle = urllib2.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        # Fetch the MRSS index listing every media item of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like 'mgid:cms:video:<site>.com:<id>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-media config lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            effTitle = showId + u'-' + epTitle
            'upload_date': officialDate,
            'stitle': simplify_title(effTitle),
            'description': officialTitle,
            'player_url': playerUrl
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report player-configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the media URL via the page's og: meta tags and the player config."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        # Description, thumbnail and player all come from <meta> tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the (url-encoded) config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON: single quotes must become double.
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        # Playlist entry 1 holds the actual media URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        'uploader': showName,
        'upload_date': None,
        'stitle': simplify_title(showName),
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Map the public video id to the internal id, then read the moogaloop XML."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # The page embeds the internal id the metadata endpoint is keyed on.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')

        'internal_id': internal_video_id,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # Missing elements here surface as IndexError from [0].
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['stitle'] = simplify_title(info['title'])
        info['url'] = videoNode.findall('./file')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        # Extension is taken from the media URL; format mirrors it.
        info['ext'] = info['url'].rpartition('.')[2]
        info['format'] = info['ext']
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract media URL, title and thumbnail from the watch page."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(video_id)

        # The flash player's url-encoded media URL is inlined in the page.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Title precedes the ' - XVID' suffix in <title>.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        'upload_date': None,
        'title': video_title,
        'stitle': simplify_title(video_title),
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # group(1) = uploader slug, group(2) = track slug
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Build the media URL from the uid and stream token found in the page."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + '-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
        title = mobj.group(1)

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # Description is optional; fall back to a placeholder.
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        description = mobj.group(1)

        # Parse the human-readable date into YYYYMMDD (best effort).
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        except Exception, e:

        # for soundcloud, a request to a cross domain is required for cookies
        # NOTE(review): std_headers is passed as urllib2.Request's 2nd positional
        # argument, which is `data` (making this a POST), not `headers` (3rd
        # positional) — confirm this is intentional.
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        'id': video_id.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': upload_date,
        'title': simple_title.decode('utf-8'),
        'stitle': simple_title.decode('utf-8'),
        'description': description.decode('utf-8')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMPE media URL (base64-encoded in the page) plus metadata."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(url)

        # The stream path is base64- and url-encoded in the jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # Both id and extension come from the stream URL's final path component.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        'upload_date': None,
        'title': video_title,
        'stitle': simplify_title(video_title),
        'format': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    # group(1) = uploader slug, group(2) = cloudcast slug
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # A successful open means the mirror is live.
            urllib2.urlopen(url)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:

    def _print_formats(self, formats):
        """Print a 'format<TAB>bitrate<TAB>[ext]' line for each available choice."""
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])

    def _real_extract(self, url):
        """Query mixcloud's JSON API and pick a format/bitrate per user preference."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
        self.report_download_json(file_url)
        jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            # Take the first format whose URL list yields a live mirror.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:

        if req_format not in formats.keys():
            self._downloader.trouble(u'ERROR: format is not available')

        url_list = self.get_urls(formats, req_format)
        file_url = self.check_urls(url_list)
        format_param = req_format

        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': u'NA',
        'title': json_data['name'],
        'stitle': simplify_title(json_data['name']),
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Three URL shapes: root page, CoursePage (course only), VideoPage (course+video).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: one video, one course, or the whole site (recursive)."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            'id': simplify_title(course + '_' + video),

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the media.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['stitle'] = simplify_title(info['title'])
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            'id': simplify_title(course),

            self.report_download_webpage(info['id'])
            coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            # Fall back to the course id when no <h1> title is present.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            info['title'] = info['id']
            info['stitle'] = simplify_title(info['title'])

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Collect each VideoPage link once, in page order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            # Recurse into each referenced video page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])

            'id': 'Stanford OpenClassroom',

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            info['title'] = info['id']
            info['stitle'] = simplify_title(info['title'])

            # Collect each CoursePage link once, then recurse into every course.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
2811 class MTVIE(InfoExtractor):
2812 """Information extractor for MTV.com"""
2814 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2817 def report_webpage(self, video_id):
2818 """Report information extraction."""
2819 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2821 def report_extraction(self, video_id):
2822 """Report information extraction."""
2823 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2825 def _real_extract(self, url):
2826 mobj = re.match(self._VALID_URL, url)
2828 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2830 if not mobj.group('proto'):
2831 url = 'http://' + url
2832 video_id = mobj.group('videoid')
2833 self.report_webpage(video_id)
2835 request = urllib2.Request(url)
2837 webpage = urllib2.urlopen(request).read()
2838 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2839 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2842 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2844 self._downloader.trouble(u'ERROR: unable to extract song name')
2846 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2847 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2849 self._downloader.trouble(u'ERROR: unable to extract performer')
2851 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2852 video_title = performer + ' - ' + song_name
2854 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2856 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2858 mtvn_uri = mobj.group(1)
2860 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2862 self._downloader.trouble(u'ERROR: unable to extract content id')
2864 content_id = mobj.group(1)
2866 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2867 self.report_extraction(video_id)
2868 request = urllib2.Request(videogen_url)
2870 metadataXml = urllib2.urlopen(request).read()
2871 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2872 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2875 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2876 renditions = mdoc.findall('.//rendition')
2878 # For now, always pick the highest quality.
2879 rendition = renditions[-1]
2882 _,_,ext = rendition.attrib['type'].partition('/')
2883 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2884 video_url = rendition.find('./src').text
2886 self._downloader.trouble('Invalid rendition field.')
2892 'uploader': performer,
2893 'title': video_title,
2894 'stitle': simplify_title(video_title),