2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs, urlparse
21 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include

    uploader: Nickname of the video uploader.
    ext: Video filename extension.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    # NOTE(review): this chunk of the file elides several original lines
    # (gaps in the original numbering); code tokens below are reproduced
    # verbatim from the visible lines.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the `def` header of the public initialize() method
    # (presumably `def initialize(self):`) is elided here — confirm against
    # the full source before editing.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this chunk elides many original lines (try/except
    # scaffolding, `return` statements, `if`/`else` headers, dict-literal
    # delimiters). Code tokens below are verbatim from the visible lines;
    # elisions that change meaning are flagged inline.

    # NOTE(review): the assignment header of the verbose _VALID_URL regex
    # (roughly `_VALID_URL = r"""^` plus the scheme alternation opener) is
    # elided; the pattern body below is original. The `# ...` texts are part
    # of the raw string (harmless under re.VERBOSE).
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
        (?: # the various things that can precede the ID:
            (?:(?:v|embed|e)/) # v/ or embed/ or e/
            |(?: # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?) # the params delimiter ? or # or #!
                (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
            )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality (itag codes, best first)
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> filename extension; most entries are elided in this chunk.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> human-readable dimensions; the dict body is elided in this chunk.
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class to compile _VALID_URL with re.VERBOSE,
        # since the pattern above is written in commented verbose form.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is unavailable."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text.

        NOTE(review): the initializer of the `srt` accumulator (presumably
        `srt = ''`) and the final `return srt` are elided in this chunk.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none given
            # NOTE(review): a conversion of `start` to float (presumably
            # `start = float(start)`) is elided just above; without it the
            # addition below would fail on a str — confirm against full source.
            end = start + float(dur)
            # Format both timestamps as HH:MM:SS,mmm per the SRT spec.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        # NOTE(review): the `for x in formats:` loop header is elided here.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age on youtube.com.

        NOTE(review): several try:/return lines are elided in this chunk.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language (best-effort: failure only warns)
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # NOTE(review): the `login_form = {` opener is elided here.
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # The login form reappearing in the response means login failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age; NOTE(review): the `age_form = {` opener is elided.
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract metadata and direct media URLs for a YouTube video.

        NOTE(review): many if/else headers, try: lines and returns are
        elided in this chunk; code tokens are verbatim.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page (has_verified=1 skips some interstitials)
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Un-escape the JS-escaped URL (\\/ -> /)
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one returns a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scrape from the watch page and try several formats
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions (opt-in via writesubtitles)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            # Map lang_code -> track name from the track list XML
            srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
            srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
            if not srt_lang_list:
                raise Trouble(u'WARNING: video has no closed captions')
            # Language preference: explicit option > English > first available
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
                srt_lang = srt_lang_list.keys()[0]
            if not srt_lang in srt_lang_list:
                raise Trouble(u'WARNING: no closed captions found in the specified language')
            request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                srt_xml = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                raise Trouble(u'WARNING: unable to download video subtitles')
            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # Subtitle problems are non-fatal; report and continue.
                self._downloader.trouble(trouble[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = urllib.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # NOTE(review): 'sig' is indexed unguarded below — a stream entry
            # without a signature would raise KeyError; confirm upstream filter.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # NOTE(review): the `results.append({` / `return [{` opener for
            # this dict literal is elided in this chunk.
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this chunk elides several original lines (try: headers,
    # returns, if/else headers, dict openers); code tokens are verbatim.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter opt-out."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age; NOTE(review): the `disclaimer_form = {` opener is elided.
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract video URL, title and uploader from a metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to YoutubeIE.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = urllib.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Fallback: pull mediaData out of the flashvars form value.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the `return [{` opener for this dict is elided.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this chunk elides several original lines (try: headers,
    # returns, if/else headers); code tokens below are verbatim.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video URL, title, uploader and date from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip title suffix and query string from the captured slug.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Opt out of the family filter so age-gated pages are reachable.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = urllib.unquote(mobj.group(1))

        # Pick the best quality key present in flashvars, best first.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = u'NA'
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # lookin for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = u'NA'
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # Reassemble DD-MM-YYYY into YYYYMMDD.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the `return [{` opener for this dict is elided.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # NOTE(review): this chunk elides several original lines (try: headers,
    # returns, if/else headers, the `video_url = mediaURL` assignments and
    # the 'uploader' entry of the result dict); code tokens are verbatim.

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title, description and thumbnail from Google Video."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # Fallback path: no mp4 download_url, use the flv videoUrl instead.
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Decode the JS hex escapes embedded in the page source.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (only when forcethumbnail requested —
        # it requires an extra search-page fetch)
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info

        # NOTE(review): the `return [{` opener for this dict is elided.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this chunk elides several original lines (try: headers,
    # returns, the `video_url = mediaURL` assignment); tokens are verbatim.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract flv URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the `return [{` opener for this dict is elided.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): this chunk elides several original lines (try: headers,
    # returns, if/else headers, the 'url' entry of the result dict);
    # code tokens below are verbatim.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata from a Yahoo! Video page.

        new_video guards the one-level recursion used to rewrite
        non-/watch/ URLs into extractable /watch/ URLs.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) here is the people|profile alternation, not
        # the uploader name in group(2) — looks suspect; confirm intent.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the `return [{` opener for this dict is elided, and
        # 'thumbnail' appears twice below (the later, non-decoded value
        # wins) — the duplicate key is a latent defect worth fixing.
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'thumbnail': video_thumbnail,
# Extractor for vimeo.com video pages. Downloads the watch page, pulls the
# player config JSON out of the HTML, and derives title, uploader, thumbnail,
# codec/quality and the final play_redirect media URL from it.
# NOTE(review): this numbered listing elides some lines (gaps in the embedded
# numbering — e.g. the 'if mobj is None:' guards, 'try:' headers and final
# 'return' of the info dict are not visible). Comments below describe only
# what the visible lines show; verify against the full source.
1031 class VimeoIE(InfoExtractor):
1032 """Information extractor for vimeo.com."""
1034 # _VALID_URL matches Vimeo URLs
1035 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1038 def __init__(self, downloader=None):
1039 InfoExtractor.__init__(self, downloader)
# Progress reporting helpers: route status lines through the FileDownloader.
1041 def report_download_webpage(self, video_id):
1042 """Report webpage download."""
1043 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1045 def report_extraction(self, video_id):
1046 """Report information extraction."""
1047 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1049 def _real_extract(self, url, new_video=True):
1050 # Extract ID from URL
1051 mobj = re.match(self._VALID_URL, url)
# (elided guard) trouble() is called when the URL does not match _VALID_URL
1053 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1056 video_id = mobj.group(1)
1058 # Retrieve video webpage to extract further information
1059 request = urllib2.Request(url, None, std_headers)
1061 self.report_download_webpage(video_id)
1062 webpage = urllib2.urlopen(request).read()
# Python 2 comma-style except clause; network/protocol failures become trouble()
1063 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1064 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1067 # Now we begin extracting as much information as we can from what we
1068 # retrieved. First we extract the information common to all extractors,
1069 # and latter we extract those that are Vimeo specific.
1070 self.report_extraction(video_id)
1072 # Extract the config JSON
# String-splitting on the surrounding JS assignment rather than real parsing;
# brittle if Vimeo changes the page markup.
1073 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1075 config = json.loads(config)
1077 self._downloader.trouble(u'ERROR: unable to extract info section')
1081 video_title = config["video"]["title"]
1084 video_uploader = config["video"]["owner"]["name"]
1086 # Extract video thumbnail
1087 video_thumbnail = config["video"]["thumbnail"]
1089 # Extract video description
1090 video_description = get_element_by_id("description", webpage.decode('utf8'))
1091 if video_description: video_description = clean_html(video_description)
1092 else: video_description = ''
1094 # Extract upload date
1095 video_upload_date = u'NA'
1096 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1097 if mobj is not None:
1098 video_upload_date = mobj.group(1)
1100 # Vimeo specific: extract request signature and timestamp
# sig/timestamp are required query parameters for the play_redirect URL below.
1101 sig = config['request']['signature']
1102 timestamp = config['request']['timestamp']
1104 # Vimeo specific: extract video codec and quality information
1105 # TODO bind to format param
# Preference order: h264/mp4 first, then vp8 and vp6 as flv.
1106 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1107 for codec in codecs:
1108 if codec[0] in config["video"]["files"]:
1109 video_codec = codec[0]
1110 video_extension = codec[1]
1111 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1112 else: quality = 'sd'
# (elided else-branch) reached when none of the known codecs is present
1115 self._downloader.trouble(u'ERROR: no known codec found')
1118 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1119 %(video_id, sig, timestamp, quality, video_codec.upper())
# Tail of the returned info dict (the surrounding 'return [{' / 'id'/'url'
# entries are elided in this listing).
1124 'uploader': video_uploader,
1125 'upload_date': video_upload_date,
1126 'title': video_title,
1127 'ext': video_extension,
1128 'thumbnail': video_thumbnail,
1129 'description': video_description,
# Last-resort extractor: used when no site-specific IE matches. First follows
# URL-shortener style redirects (via a HEAD probe), then scrapes the page for
# a JW-Player/SWFObject style "file=" media URL and falls back to broader
# regexes.
# NOTE(review): this numbered listing elides some lines (gaps in the embedded
# numbering — e.g. 'if mobj is None:' guards, 'try:' headers, 'return'
# statements and docstring delimiters are missing). Comments describe only
# what the visible lines show; verify against the full source.
1134 class GenericIE(InfoExtractor):
1135 """Generic last-resort information extractor."""
1138 IE_NAME = u'generic'
1140 def __init__(self, downloader=None):
1141 InfoExtractor.__init__(self, downloader)
1143 def report_download_webpage(self, video_id):
1144 """Report webpage download."""
# Warn explicitly: reaching the generic IE means every specific IE failed.
1145 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1146 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1148 def report_extraction(self, video_id):
1149 """Report information extraction."""
1150 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1152 def report_following_redirect(self, new_url):
# NOTE(review): docstring is copy-pasted; this reports redirect following,
# not extraction.
1153 """Report information extraction."""
1154 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1156 def _test_redirect(self, url):
1157 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Request subclass forcing the HEAD method so the probe downloads no body.
1158 class HeadRequest(urllib2.Request):
1159 def get_method(self):
# (elided) presumably: return "HEAD"
1162 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
# Lines below are docstring text whose quote delimiters are elided in
# this listing.
1164 Subclass the HTTPRedirectHandler to make it use our
1165 HeadRequest also on the redirected URL
1167 def redirect_request(self, req, fp, code, msg, headers, newurl):
1168 if code in (301, 302, 303, 307):
1169 newurl = newurl.replace(' ', '%20')
# Strip body-related headers: a HEAD follow-up carries no payload.
1170 newheaders = dict((k,v) for k,v in req.headers.items()
1171 if k.lower() not in ("content-length", "content-type"))
1172 return HeadRequest(newurl,
1174 origin_req_host=req.get_origin_req_host(),
# Non-redirect codes are surfaced as HTTPError, matching urllib2 semantics.
1177 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1179 class HTTPMethodFallback(urllib2.BaseHandler):
# Docstring text (delimiters elided in this listing):
1181 Fallback to GET if HEAD is not allowed (405 HTTP error)
1183 def http_error_405(self, req, fp, code, msg, headers):
1187 newheaders = dict((k,v) for k,v in req.headers.items()
1188 if k.lower() not in ("content-length", "content-type"))
# Retry the same URL as a plain (GET) request through the parent opener.
1189 return self.parent.open(urllib2.Request(req.get_full_url(),
1191 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener by hand so only these handlers run the HEAD probe.
1195 opener = urllib2.OpenerDirector()
1196 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1197 HTTPMethodFallback, HEADRedirectHandler,
1198 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1199 opener.add_handler(handler())
1201 response = opener.open(HeadRequest(url))
1202 new_url = response.geturl()
# Same URL back means no redirect happened; nothing to restart.
1204 if url == new_url: return False
1206 self.report_following_redirect(new_url)
# Restart the whole IE chain on the redirect target.
1207 self._downloader.download([new_url])
1210 def _real_extract(self, url):
1211 if self._test_redirect(url): return
# Provisional id from the URL tail; refined from the media URL further down.
1213 video_id = url.split('/')[-1]
1214 request = urllib2.Request(url)
1216 self.report_download_webpage(video_id)
1217 webpage = urllib2.urlopen(request).read()
1218 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1219 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1221 except ValueError, err:
1222 # since this is the last-resort InfoExtractor, if
1223 # this error is thrown, it'll be thrown here
1224 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1227 self.report_extraction(video_id)
1228 # Start with something easy: JW Player in SWFObject
1229 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1231 # Broaden the search a little bit
1232 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1234 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1237 # It's possible that one of the regexes
1238 # matched, but returned an empty group:
1239 if mobj.group(1) is None:
1240 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1243 video_url = urllib.unquote(mobj.group(1))
1244 video_id = os.path.basename(video_url)
1246 # here's a fun little line of code for you:
1247 video_extension = os.path.splitext(video_id)[1][1:]
1248 video_id = os.path.splitext(video_id)[0]
1250 # it's tempting to parse this further, but you would
1251 # have to take into account all the variations like
1252 # Video Title - Site Name
1253 # Site Name | Video Title
1254 # Video Title - Tagline | Site Name
1255 # and so on and so forth; it's just not practical
1256 mobj = re.search(r'<title>(.*)</title>', webpage)
1258 self._downloader.trouble(u'ERROR: unable to extract title')
1260 video_title = mobj.group(1).decode('utf-8')
1262 # video uploader is domain name
1263 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): message says 'title' but this guard belongs to the
# uploader/domain match above — looks like a copy-paste slip.
1265 self._downloader.trouble(u'ERROR: unable to extract title')
1267 video_uploader = mobj.group(1).decode('utf-8')
# Tail of the returned info dict (surrounding 'return [{' elided).
1270 'id': video_id.decode('utf-8'),
1271 'url': video_url.decode('utf-8'),
1272 'uploader': video_uploader,
1273 'upload_date': u'NA',
1274 'title': video_title,
1275 'ext': video_extension.decode('utf-8'),
# Handles "ytsearch<N>:" / "ytsearchall:" pseudo-URLs: queries the YouTube
# GData API (JSON-C format, 50 results per page) and hands each resulting
# watch URL back to the downloader.
# NOTE(review): this numbered listing elides some lines (gaps in the embedded
# numbering — 'if mobj is None:' guards, 'try:' headers, 'return's,
# initializations of pagenum/video_ids/limit). Comments describe only the
# visible lines; verify against the full source.
1281 class YoutubeSearchIE(InfoExtractor):
1282 """Information Extractor for YouTube search queries."""
1283 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1284 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
# Hard cap mirroring the GData API's maximum addressable result window.
1285 _max_youtube_results = 1000
1286 IE_NAME = u'youtube:search'
1288 def __init__(self, downloader=None):
1289 InfoExtractor.__init__(self, downloader)
1291 def report_download_page(self, query, pagenum):
1292 """Report attempt to download search page with given number."""
# Decode the byte-string query with the terminal's preferred encoding
# before interpolating into a unicode status line.
1293 query = query.decode(preferredencoding())
1294 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1296 def _real_extract(self, query):
1297 mobj = re.match(self._VALID_URL, query)
1299 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN:terms" into the count prefix and the search terms.
1302 prefix, query = query.split(':')
1304 query = query.encode('utf-8')
# (elided branch) empty prefix: download a single result
1306 self._download_n_results(query, 1)
1308 elif prefix == 'all':
1309 self._download_n_results(query, self._max_youtube_results)
# (elided) numeric prefix parsed as n; n <= 0 is rejected:
1315 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1317 elif n > self._max_youtube_results:
1318 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1319 n = self._max_youtube_results
1320 self._download_n_results(query, n)
1322 except ValueError: # parsing prefix as integer fails
1323 self._download_n_results(query, 1)
1326 def _download_n_results(self, query, n):
1327 """Downloads a specified number of results for a query"""
# Page through the API 50 ids at a time until 'limit' is reached.
1333 while (50 * pagenum) < limit:
1334 self.report_download_page(query, pagenum+1)
# GData start-index is 1-based, hence the +1.
1335 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1336 request = urllib2.Request(result_url)
1338 data = urllib2.urlopen(request).read()
1339 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1340 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1342 api_response = json.loads(data)['data']
1344 new_ids = list(video['id'] for video in api_response['items'])
1345 video_ids += new_ids
# Shrink the target to what the API says is actually available.
1347 limit = min(n, api_response['totalItems'])
1350 if len(video_ids) > n:
1351 video_ids = video_ids[:n]
1352 for id in video_ids:
1353 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# Handles "gvsearch<N>:" / "gvsearchall:" pseudo-URLs: scrapes Google Video
# search result pages for videoplay docids and hands each result URL back to
# the downloader. Structure parallels YoutubeSearchIE but scrapes HTML
# instead of calling an API.
# NOTE(review): this numbered listing elides some lines ('if mobj is None:'
# guards, 'try:' headers, 'return's, pagenum/video_ids initialization).
# Comments describe only the visible lines; verify against the full source.
1357 class GoogleSearchIE(InfoExtractor):
1358 """Information Extractor for Google Video search queries."""
1359 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1360 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Regex markers for result links and the "next page" control on result HTML.
1361 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1362 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1363 _max_google_results = 1000
1364 IE_NAME = u'video.google:search'
1366 def __init__(self, downloader=None):
1367 InfoExtractor.__init__(self, downloader)
1369 def report_download_page(self, query, pagenum):
1370 """Report attempt to download playlist page with given number."""
1371 query = query.decode(preferredencoding())
1372 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1374 def _real_extract(self, query):
1375 mobj = re.match(self._VALID_URL, query)
1377 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1380 prefix, query = query.split(':')
1382 query = query.encode('utf-8')
# (elided branch) empty prefix: download a single result
1384 self._download_n_results(query, 1)
1386 elif prefix == 'all':
1387 self._download_n_results(query, self._max_google_results)
# (elided) numeric prefix parsed as n; n <= 0 rejected:
1393 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1395 elif n > self._max_google_results:
1396 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1397 n = self._max_google_results
1398 self._download_n_results(query, n)
1400 except ValueError: # parsing prefix as integer fails
1401 self._download_n_results(query, 1)
1404 def _download_n_results(self, query, n):
1405 """Downloads a specified number of results for a query"""
1411 self.report_download_page(query, pagenum)
# Google paginates by absolute result offset, 10 results per page.
1412 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1413 request = urllib2.Request(result_url)
1415 page = urllib2.urlopen(request).read()
1416 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1417 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1420 # Extract video identifiers
1421 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1422 video_id = mobj.group(1)
1423 if video_id not in video_ids:
1424 video_ids.append(video_id)
1425 if len(video_ids) == n:
1426 # Specified n videos reached
1427 for id in video_ids:
1428 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next page" marker: dispatch whatever was collected and stop.
1431 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1432 for id in video_ids:
1433 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1436 pagenum = pagenum + 1
# Handles "yvsearch<N>:" / "yvsearchall:" pseudo-URLs: scrapes Yahoo! Video
# search result pages for watch URLs. Parallel in structure to
# GoogleSearchIE, but dedupes via an explicit 'already_seen' set.
# NOTE(review): this numbered listing elides some lines ('if mobj is None:'
# guards, 'try:' headers, 'return's, pagenum/video_ids initialization).
# Comments describe only the visible lines; verify against the full source.
1439 class YahooSearchIE(InfoExtractor):
1440 """Information Extractor for Yahoo! Video search queries."""
1441 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1442 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1443 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1444 _MORE_PAGES_INDICATOR = r'\s*Next'
1445 _max_yahoo_results = 1000
1446 IE_NAME = u'video.yahoo:search'
1448 def __init__(self, downloader=None):
1449 InfoExtractor.__init__(self, downloader)
1451 def report_download_page(self, query, pagenum):
1452 """Report attempt to download playlist page with given number."""
1453 query = query.decode(preferredencoding())
1454 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1456 def _real_extract(self, query):
1457 mobj = re.match(self._VALID_URL, query)
1459 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1462 prefix, query = query.split(':')
1464 query = query.encode('utf-8')
# (elided branch) empty prefix: download a single result
1466 self._download_n_results(query, 1)
1468 elif prefix == 'all':
1469 self._download_n_results(query, self._max_yahoo_results)
# (elided) numeric prefix parsed as n; n <= 0 rejected:
1475 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1477 elif n > self._max_yahoo_results:
1478 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1479 n = self._max_yahoo_results
1480 self._download_n_results(query, n)
1482 except ValueError: # parsing prefix as integer fails
1483 self._download_n_results(query, 1)
1486 def _download_n_results(self, query, n):
1487 """Downloads a specified number of results for a query"""
# Set-based dedupe keeps membership tests O(1) while preserving order
# in video_ids.
1490 already_seen = set()
1494 self.report_download_page(query, pagenum)
1495 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1496 request = urllib2.Request(result_url)
1498 page = urllib2.urlopen(request).read()
1499 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1500 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1503 # Extract video identifiers
1504 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1505 video_id = mobj.group(1)
1506 if video_id not in already_seen:
1507 video_ids.append(video_id)
1508 already_seen.add(video_id)
1509 if len(video_ids) == n:
1510 # Specified n videos reached
1511 for id in video_ids:
1512 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "next page" marker: dispatch whatever was collected and stop.
1515 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1516 for id in video_ids:
1517 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1520 pagenum = pagenum + 1
# Extractor for YouTube playlists, artist pages and courses. Pages through
# the playlist HTML collecting watch?v= ids, applies the user's
# playliststart/playlistend window, and queues each video URL.
# NOTE(review): this numbered listing elides some lines ('if mobj is None:'
# guards, 'try:' headers, 'return's, pagenum/video_ids initialization,
# an else: before line 1557). Comments describe only the visible lines;
# verify against the full source.
1523 class YoutubePlaylistIE(InfoExtractor):
1524 """Information Extractor for YouTube playlists."""
# Matches playlist/course/artist URLs as well as bare PL/EC ids; group(1) is
# the URL type marker, group(2) the playlist id, group(3) an optional
# direct video component.
1526 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1527 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1528 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1529 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1530 IE_NAME = u'youtube:playlist'
1532 def __init__(self, downloader=None):
1533 InfoExtractor.__init__(self, downloader)
1535 def report_download_page(self, playlist_id, pagenum):
1536 """Report attempt to download playlist page with given number."""
1537 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1539 def _real_extract(self, url):
1540 # Extract playlist id
1541 mobj = re.match(self._VALID_URL, url)
1543 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single video inside a playlist URL: delegate straight to the downloader.
1547 if mobj.group(3) is not None:
1548 self._downloader.download([mobj.group(3)])
1551 # Download playlist pages
1552 # prefix is 'p' as default for playlists but there are other types that need extra care
1553 playlist_prefix = mobj.group(1)
1554 if playlist_prefix == 'a':
1555 playlist_access = 'artist'
# (elided else:) default playlist handling:
1557 playlist_prefix = 'p'
1558 playlist_access = 'view_play_list'
1559 playlist_id = mobj.group(2)
1564 self.report_download_page(playlist_id, pagenum)
1565 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1566 request = urllib2.Request(url)
1568 page = urllib2.urlopen(request).read()
1569 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1570 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1573 # Extract video identifiers
1575 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1576 if mobj.group(1) not in ids_in_page:
1577 ids_in_page.append(mobj.group(1))
1578 video_ids.extend(ids_in_page)
# Stop paging once the "next" pager control disappears.
1580 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1582 pagenum = pagenum + 1
# Apply the user's 1-based playliststart/playlistend window; -1 end means
# "to the end of the playlist".
1584 playliststart = self._downloader.params.get('playliststart', 1) - 1
1585 playlistend = self._downloader.params.get('playlistend', -1)
1586 if playlistend == -1:
1587 video_ids = video_ids[playliststart:]
1589 video_ids = video_ids[playliststart:playlistend]
1591 for id in video_ids:
1592 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# Extractor for YouTube channel pages: walks /channel/<id>/videos list pages,
# collects watch?v= ids, and queues each video URL. Unlike the sibling IEs
# it defines no __init__ and inherits InfoExtractor's directly.
# NOTE(review): this numbered listing elides some lines ('if mobj is None:'
# guards, 'try:' headers, 'return's, pagenum/video_ids initialization).
# Comments describe only the visible lines; verify against the full source.
1596 class YoutubeChannelIE(InfoExtractor):
1597 """Information Extractor for YouTube channels."""
1599 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1600 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1601 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1602 IE_NAME = u'youtube:channel'
1604 def report_download_page(self, channel_id, pagenum):
1605 """Report attempt to download channel page with given number."""
1606 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1608 def _real_extract(self, url):
1609 # Extract channel id
1610 mobj = re.match(self._VALID_URL, url)
1612 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1615 # Download channel pages
1616 channel_id = mobj.group(1)
1621 self.report_download_page(channel_id, pagenum)
1622 url = self._TEMPLATE_URL % (channel_id, pagenum)
1623 request = urllib2.Request(url)
1625 page = urllib2.urlopen(request).read()
1626 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1627 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1630 # Extract video identifiers
1632 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1633 if mobj.group(1) not in ids_in_page:
1634 ids_in_page.append(mobj.group(1))
1635 video_ids.extend(ids_in_page)
# Stop paging once the "Next" button markup disappears.
1637 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1639 pagenum = pagenum + 1
1641 for id in video_ids:
1642 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# Extractor for a YouTube user's uploads. Pages through the GData uploads
# feed (_GDATA_PAGE_SIZE entries per request), collects video ids, applies
# the playliststart/playlistend window, and queues each watch URL.
# NOTE(review): this numbered listing elides some lines ('if mobj is None:'
# guards, 'try:' headers, 'return'/'break' statements, loop and list
# initialization). Comments describe only the visible lines; verify against
# the full source.
1646 class YoutubeUserIE(InfoExtractor):
1647 """Information Extractor for YouTube users."""
1649 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1650 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1651 _GDATA_PAGE_SIZE = 50
1652 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1653 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1654 IE_NAME = u'youtube:user'
1656 def __init__(self, downloader=None):
1657 InfoExtractor.__init__(self, downloader)
1659 def report_download_page(self, username, start_index):
1660 """Report attempt to download user page."""
1661 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1662 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1664 def _real_extract(self, url):
1666 mobj = re.match(self._VALID_URL, url)
1668 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1671 username = mobj.group(1)
1673 # Download video ids using YouTube Data API. Result size per
1674 # query is limited (currently to 50 videos) so we need to query
1675 # page by page until there are no video ids - it means we got
# (comment continuation elided: "...all of them.")
# GData start-index is 1-based.
1682 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1683 self.report_download_page(username, start_index)
1685 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1688 page = urllib2.urlopen(request).read()
1689 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1690 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1693 # Extract video identifiers
1696 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1697 if mobj.group(1) not in ids_in_page:
1698 ids_in_page.append(mobj.group(1))
1700 video_ids.extend(ids_in_page)
1702 # A little optimization - if current page is not
1703 # "full", ie. does not contain PAGE_SIZE video ids then
1704 # we can assume that this page is the last one - there
1705 # are no more ids on further pages - no need to query
# (comment continuation and 'break' elided)
1708 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the user's 1-based playliststart/playlistend window; -1 end means
# "through the last upload".
1713 all_ids_count = len(video_ids)
1714 playliststart = self._downloader.params.get('playliststart', 1) - 1
1715 playlistend = self._downloader.params.get('playlistend', -1)
1717 if playlistend == -1:
1718 video_ids = video_ids[playliststart:]
1720 video_ids = video_ids[playliststart:playlistend]
1722 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1723 (username, all_ids_count, len(video_ids)))
1725 for video_id in video_ids:
1726 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# Extractor for a blip.tv user's videos. Fetches the user page to discover
# the numeric users_id, then pages through the mobile AJAX episode-list
# endpoint collecting hrefs, applies the playliststart/playlistend window,
# and queues each video. Mirrors YoutubeUserIE's structure.
# NOTE(review): this numbered listing elides some lines ('if mobj is None:'
# guards, 'try:' headers, 'return'/'break' statements, _PAGE_SIZE and loop
# initialization). Comments describe only the visible lines; verify against
# the full source.
1729 class BlipTVUserIE(InfoExtractor):
1730 """Information Extractor for blip.tv users."""
1732 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1734 IE_NAME = u'blip.tv:user'
1736 def __init__(self, downloader=None):
1737 InfoExtractor.__init__(self, downloader)
1739 def report_download_page(self, username, pagenum):
1740 """Report attempt to download user page."""
1741 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1742 (self.IE_NAME, username, pagenum))
1744 def _real_extract(self, url):
1746 mobj = re.match(self._VALID_URL, url)
1748 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1751 username = mobj.group(1)
# AJAX endpoint template; users_id is filled in after scraping the page.
1753 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1755 request = urllib2.Request(url)
1758 page = urllib2.urlopen(request).read().decode('utf-8')
# The numeric users_id lives in a data attribute on the user page.
1759 mobj = re.search(r'data-users-id="([^"]+)"', page)
1760 page_base = page_base % mobj.group(1)
1761 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1762 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1766 # Download video ids using BlipTV Ajax calls. Result size per
1767 # query is limited (currently to 12 videos) so we need to query
1768 # page by page until there are no video ids - it means we got
# (comment continuation elided: "...all of them.")
1775 self.report_download_page(username, pagenum)
1777 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1780 page = urllib2.urlopen(request).read().decode('utf-8')
1781 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1782 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1785 # Extract video identifiers
1788 for mobj in re.finditer(r'href="/([^"]+)"', page):
1789 if mobj.group(1) not in ids_in_page:
1790 ids_in_page.append(unescapeHTML(mobj.group(1)))
1792 video_ids.extend(ids_in_page)
1794 # A little optimization - if current page is not
1795 # "full", ie. does not contain PAGE_SIZE video ids then
1796 # we can assume that this page is the last one - there
1797 # are no more ids on further pages - no need to query
# (comment continuation and 'break' elided)
1800 if len(ids_in_page) < self._PAGE_SIZE:
# Apply the user's 1-based playliststart/playlistend window; -1 end means
# "through the last video".
1805 all_ids_count = len(video_ids)
1806 playliststart = self._downloader.params.get('playliststart', 1) - 1
1807 playlistend = self._downloader.params.get('playlistend', -1)
1809 if playlistend == -1:
1810 video_ids = video_ids[playliststart:]
1812 video_ids = video_ids[playliststart:playlistend]
1814 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1815 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1817 for video_id in video_ids:
1818 self._downloader.download([u'http://blip.tv/'+video_id])
# Extractor for depositfiles.com file pages (arbitrary files, not video).
# Rebuilds the URL in the English locale, POSTs the "Free download" form,
# then scrapes the fileshare form action for the real file URL and the
# page for the file title.
# NOTE(review): this numbered listing elides some lines ('if mobj is None:'
# guards, 'try:' headers, 'return' statements). Comments describe only the
# visible lines; verify against the full source.
1821 class DepositFilesIE(InfoExtractor):
1822 """Information extractor for depositfiles.com"""
# (?#locale) is a regex comment marking the two-letter locale path segment
# matched by '../'.
1824 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1825 IE_NAME = u'DepositFiles'
1827 def __init__(self, downloader=None):
1828 InfoExtractor.__init__(self, downloader)
1830 def report_download_webpage(self, file_id):
1831 """Report webpage download."""
1832 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1834 def report_extraction(self, file_id):
1835 """Report information extraction."""
1836 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1838 def _real_extract(self, url):
1839 file_id = url.split('/')[-1]
1840 # Rebuild url in english locale
1841 url = 'http://depositfiles.com/en/files/' + file_id
1843 # Retrieve file webpage with 'Free download' button pressed
# Passing form data makes urllib2 issue a POST, simulating the button press.
1844 free_download_indication = { 'gateway_result' : '1' }
1845 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1847 self.report_download_webpage(file_id)
1848 webpage = urllib2.urlopen(request).read()
1849 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1850 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1853 # Search for the real file URL
1854 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1855 if (mobj is None) or (mobj.group(1) is None):
1856 # Try to figure out reason of the error.
# The site explains download restrictions in a <strong>Attention...</strong>
# block; surface that text instead of a generic failure.
1857 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1858 if (mobj is not None) and (mobj.group(1) is not None):
1859 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1860 self._downloader.trouble(u'ERROR: %s' % restriction_message)
# (elided else:) no restriction text found either:
1862 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1865 file_url = mobj.group(1)
1866 file_extension = os.path.splitext(file_url)[1][1:]
1868 # Search for file title
1869 mobj = re.search(r'<b title="(.*?)">', webpage)
1871 self._downloader.trouble(u'ERROR: unable to extract title')
1873 file_title = mobj.group(1).decode('utf-8')
# Tail of the returned info dict (surrounding 'return [{' elided).
1876 'id': file_id.decode('utf-8'),
1877 'url': file_url.decode('utf-8'),
1879 'upload_date': u'NA',
1880 'title': file_title,
1881 'ext': file_extension.decode('utf-8'),
1887 class FacebookIE(InfoExtractor):
1888 """Information Extractor for Facebook"""
1890 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1891 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1892 _NETRC_MACHINE = 'facebook'
1893 _available_formats = ['video', 'highqual', 'lowqual']
1894 _video_extensions = {
1899 IE_NAME = u'facebook'
1901 def __init__(self, downloader=None):
1902 InfoExtractor.__init__(self, downloader)
1904 def _reporter(self, message):
1905 """Add header and report message."""
1906 self._downloader.to_screen(u'[facebook] %s' % message)
1908 def report_login(self):
1909 """Report attempt to log in."""
1910 self._reporter(u'Logging in')
1912 def report_video_webpage_download(self, video_id):
1913 """Report attempt to download video webpage."""
1914 self._reporter(u'%s: Downloading video webpage' % video_id)
1916 def report_information_extraction(self, video_id):
1917 """Report attempt to extract video information."""
1918 self._reporter(u'%s: Extracting video information' % video_id)
1920 def _parse_page(self, video_webpage):
1921 """Extract video information from page"""
1923 data = {'title': r'\("video_title", "(.*?)"\)',
1924 'description': r'<div class="datawrap">(.*?)</div>',
1925 'owner': r'\("video_owner_name", "(.*?)"\)',
1926 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1929 for piece in data.keys():
1930 mobj = re.search(data[piece], video_webpage)
1931 if mobj is not None:
1932 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1936 for fmt in self._available_formats:
1937 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1938 if mobj is not None:
1939 # URL is in a Javascript segment inside an escaped Unicode format within
1940 # the generally utf-8 page
1941 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1942 video_info['video_urls'] = video_urls
# NOTE(review): garbled extraction — several lines missing (early return for
# no downloader, try: lines, credential assignment from netrc, the login_form
# construction). Do not edit without the pristine source.
# Logs in to Facebook using --username/--password or .netrc credentials;
# login failures are reported as warnings, not fatal errors.
1946 def _real_initialize(self):
1947 if self._downloader is None:
1952 downloader_params = self._downloader.params
1954 # Attempt to use provided username and password or .netrc data
1955 if downloader_params.get('username', None) is not None:
1956 useremail = downloader_params['username']
1957 password = downloader_params['password']
1958 elif downloader_params.get('usenetrc', False):
# .netrc lookup keyed by the extractor's machine name.
1960 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1961 if info is not None:
1965 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1966 except (IOError, netrc.NetrcParseError), err:
1967 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1970 if useremail is None:
# POST the login form; a re-served login form in the response means failure.
1979 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1982 login_results = urllib2.urlopen(request).read()
1983 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1984 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1986 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1987 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# NOTE(review): garbled extraction — guard lines (if mobj is None:), try:
# lines, else: branches and several returns are missing. Comments document
# only the visible flow.
# Downloads the Facebook video page, parses metadata via _parse_page, picks
# format(s) per --format/--format-limit, and builds the info dict(s).
1990 def _real_extract(self, url):
1991 mobj = re.match(self._VALID_URL, url)
1993 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1995 video_id = mobj.group('ID')
1998 self.report_video_webpage_download(video_id)
1999 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2001 page = urllib2.urlopen(request)
2002 video_webpage = page.read()
2003 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2004 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2007 # Start extracting information
2008 self.report_information_extraction(video_id)
2010 # Extract information
2011 video_info = self._parse_page(video_webpage)
# Uploader and title are mandatory; thumbnail is optional (warning only).
2014 if 'owner' not in video_info:
2015 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2017 video_uploader = video_info['owner']
2020 if 'title' not in video_info:
2021 self._downloader.trouble(u'ERROR: unable to extract video title')
2023 video_title = video_info['title']
2024 video_title = video_title.decode('utf-8')
2027 if 'thumbnail' not in video_info:
2028 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2029 video_thumbnail = ''
2031 video_thumbnail = video_info['thumbnail']
# Upload date parsed from an RFC-2822 style timestamp into YYYYMMDD.
2035 if 'upload_date' in video_info:
2036 upload_time = video_info['upload_date']
2037 timetuple = email.utils.parsedate_tz(upload_time)
2038 if timetuple is not None:
2040 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2045 video_description = video_info.get('description', 'No description available.')
2047 url_map = video_info['video_urls']
2048 if len(url_map.keys()) > 0:
2049 # Decide which formats to download
2050 req_format = self._downloader.params.get('format', None)
2051 format_limit = self._downloader.params.get('format_limit', None)
# --format-limit truncates the preference list at the given format.
2053 if format_limit is not None and format_limit in self._available_formats:
2054 format_list = self._available_formats[self._available_formats.index(format_limit):]
2056 format_list = self._available_formats
2057 existing_formats = [x for x in format_list if x in url_map]
2058 if len(existing_formats) == 0:
2059 self._downloader.trouble(u'ERROR: no known formats available for video')
# Format selection: default=best, 'worst'=last, '-1'=all, else exact match.
2061 if req_format is None:
2062 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2063 elif req_format == 'worst':
2064 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2065 elif req_format == '-1':
2066 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2069 if req_format not in url_map:
2070 self._downloader.trouble(u'ERROR: requested format not available')
2072 video_url_list = [(req_format, url_map[req_format])] # Specific format
# One info dict per selected format; extension falls back to mp4.
2075 for format_param, video_real_url in video_url_list:
2077 video_extension = self._video_extensions.get(format_param, 'mp4')
2080 'id': video_id.decode('utf-8'),
2081 'url': video_real_url.decode('utf-8'),
2082 'uploader': video_uploader.decode('utf-8'),
2083 'upload_date': upload_date,
2084 'title': video_title,
2085 'ext': video_extension.decode('utf-8'),
2086 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2087 'thumbnail': video_thumbnail.decode('utf-8'),
2088 'description': video_description.decode('utf-8'),
# NOTE(review): garbled extraction — line numbers fused in; try:, else:,
# return and the info-dict opener are missing. Recover pristine source
# before editing.
2093 class BlipTVIE(InfoExtractor):
2094 """Information extractor for blip.tv"""
# _VALID_URL matches any blip.tv path; _URL_EXT pulls the file extension.
2096 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2097 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2098 IE_NAME = u'blip.tv'
2100 def report_extraction(self, file_id):
2101 """Report information extraction."""
2102 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2104 def report_direct_download(self, title):
2105 """Report information extraction."""
2106 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
# Two paths: a Content-Type of video/* is a direct file download; otherwise
# the page is re-requested with skin=json and parsed as JSON metadata.
2108 def _real_extract(self, url):
2109 mobj = re.match(self._VALID_URL, url)
2111 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2118 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2119 request = urllib2.Request(json_url.encode('utf-8'))
2120 self.report_extraction(mobj.group(1))
2123 urlh = urllib2.urlopen(request)
2124 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2125 basename = url.split('/')[-1]
2126 title,ext = os.path.splitext(basename)
2127 title = title.decode('UTF-8')
2128 ext = ext.replace('.', '')
2129 self.report_direct_download(title)
2137 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2138 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2140 if info is None: # Regular URL
2142 json_code = urlh.read()
2143 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2144 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2148 json_data = json.loads(json_code)
# blip.tv wraps the payload in a 'Post' key for regular pages.
2149 if 'Post' in json_data:
2150 data = json_data['Post']
2154 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2155 video_url = data['media']['url']
2156 umobj = re.match(self._URL_EXT, video_url)
2158 raise ValueError('Can not determine filename extension')
2159 ext = umobj.group(1)
2162 'id': data['item_id'],
2164 'uploader': data['display_name'],
2165 'upload_date': upload_date,
2166 'title': data['title'],
2168 'format': data['media']['mimeType'],
2169 'thumbnail': data['thumbnailUrl'],
2170 'description': data['description'],
2171 'player_url': data['embedUrl']
2173 except (ValueError,KeyError), err:
2174 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# Spoofs the iTunes user agent — presumably required by blip.tv's CDN;
# TODO(review) confirm against upstream history.
2177 std_headers['User-Agent'] = 'iTunes/10.6.1'
# NOTE(review): garbled extraction — guard/try/return lines missing.
2181 class MyVideoIE(InfoExtractor):
2182 """Information Extractor for myvideo.de."""
2184 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2185 IE_NAME = u'myvideo'
2187 def __init__(self, downloader=None):
2188 InfoExtractor.__init__(self, downloader)
2190 def report_download_webpage(self, video_id):
2191 """Report webpage download."""
2192 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2194 def report_extraction(self, video_id):
2195 """Report information extraction."""
2196 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2198 def _real_extract(self,url):
2199 mobj = re.match(self._VALID_URL, url)
# BUG(review): 'self._download' is undefined — every other call site uses
# 'self._downloader'. Fix once the pristine source is recovered.
2201 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2204 video_id = mobj.group(1)
2207 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2209 self.report_download_webpage(video_id)
2210 webpage = urllib2.urlopen(request).read()
2211 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2212 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2215 self.report_extraction(video_id)
# Media URL is derived from the thumbnail's <link rel='image_src'> path.
2216 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2219 self._downloader.trouble(u'ERROR: unable to extract media URL')
2221 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2223 mobj = re.search('<title>([^<]+)</title>', webpage)
2225 self._downloader.trouble(u'ERROR: unable to extract title')
2228 video_title = mobj.group(1)
2234 'upload_date': u'NA',
2235 'title': video_title,
# NOTE(review): garbled extraction — line numbers fused in; try:, else:,
# return, loop bodies and dict openers are missing.
2241 class ComedyCentralIE(InfoExtractor):
2242 """Information extractor for The Daily Show and Colbert Report """
# Accepts short aliases (:tds, :colbert, ...) as well as full-episode URLs.
2244 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2245 IE_NAME = u'comedycentral'
2247 def report_extraction(self, episode_id):
2248 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2250 def report_config_download(self, episode_id):
2251 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2253 def report_index_download(self, episode_id):
2254 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2256 def report_player_url(self, episode_id):
2257 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
# Flow: resolve alias -> fetch episode page -> find the Flash media URI ->
# resolve player URL -> download MRSS index -> per-item mediaGen config ->
# pick the highest-bitrate rendition.
2259 def _real_extract(self, url):
2260 mobj = re.match(self._VALID_URL, url)
2262 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2265 if mobj.group('shortname'):
2266 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2267 url = u'http://www.thedailyshow.com/full-episodes/'
2269 url = u'http://www.colbertnation.com/full-episodes/'
2270 mobj = re.match(self._VALID_URL, url)
2271 assert mobj is not None
2273 dlNewest = not mobj.group('episode')
2275 epTitle = mobj.group('showname')
2277 epTitle = mobj.group('episode')
2279 req = urllib2.Request(url)
2280 self.report_extraction(epTitle)
2282 htmlHandle = urllib2.urlopen(req)
2283 html = htmlHandle.read()
2284 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2285 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The "newest episode" alias redirects; re-match against the final URL.
2288 url = htmlHandle.geturl()
2289 mobj = re.match(self._VALID_URL, url)
2291 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2293 if mobj.group('episode') == '':
2294 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2296 epTitle = mobj.group('episode')
2298 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2299 if len(mMovieParams) == 0:
2300 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2303 playerUrl_raw = mMovieParams[0][0]
2304 self.report_player_url(epTitle)
2306 urlHandle = urllib2.urlopen(playerUrl_raw)
2307 playerUrl = urlHandle.geturl()
2308 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2309 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2312 uri = mMovieParams[0][1]
2313 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2314 self.report_index_download(epTitle)
2316 indexXml = urllib2.urlopen(indexUrl).read()
2317 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2318 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One MRSS <item> per video segment of the episode.
2323 idoc = xml.etree.ElementTree.fromstring(indexXml)
2324 itemEls = idoc.findall('.//item')
2325 for itemEl in itemEls:
2326 mediaId = itemEl.findall('./guid')[0].text
2327 shortMediaId = mediaId.split(':')[-1]
2328 showId = mediaId.split(':')[-2].replace('.com', '')
2329 officialTitle = itemEl.findall('./title')[0].text
2330 officialDate = itemEl.findall('./pubDate')[0].text
2332 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2333 urllib.urlencode({'uri': mediaId}))
2334 configReq = urllib2.Request(configUrl)
2335 self.report_config_download(epTitle)
2337 configXml = urllib2.urlopen(configReq).read()
2338 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2339 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2342 cdoc = xml.etree.ElementTree.fromstring(configXml)
2344 for rendition in cdoc.findall('.//rendition'):
2345 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2349 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2352 # For now, just pick the highest bitrate
2353 format,video_url = turls[-1]
2355 effTitle = showId + u'-' + epTitle
2360 'upload_date': officialDate,
2365 'description': officialTitle,
2366 'player_url': playerUrl
2369 results.append(info)
# NOTE(review): garbled extraction — line numbers fused in; try:, return and
# the info-dict opener are missing.
2374 class EscapistIE(InfoExtractor):
2375 """Information extractor for The Escapist """
2377 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2378 IE_NAME = u'escapist'
2380 def report_extraction(self, showName):
2381 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2383 def report_config_download(self, showName):
2384 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Flow: fetch page (decoded per Content-Type charset) -> read og: meta tags
# -> pull config= URL out of the player URL -> fetch JS-flavoured "JSON"
# config -> take playlist[1]['url'] as the media URL.
2386 def _real_extract(self, url):
2387 mobj = re.match(self._VALID_URL, url)
2389 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2391 showName = mobj.group('showname')
2392 videoId = mobj.group('episode')
2394 self.report_extraction(showName)
2396 webPage = urllib2.urlopen(url)
2397 webPageBytes = webPage.read()
# Charset sniffed from the Content-Type header; utf-8 fallback.
2398 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2399 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2400 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2401 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2404 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2405 description = unescapeHTML(descMatch.group(1))
2406 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2407 imgUrl = unescapeHTML(imgMatch.group(1))
2408 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2409 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2410 configUrlMatch = re.search('config=(.*)$', playerUrl)
2411 configUrl = urllib2.unquote(configUrlMatch.group(1))
2413 self.report_config_download(showName)
2415 configJSON = urllib2.urlopen(configUrl).read()
2416 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2417 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2420 # Technically, it's JavaScript, not JSON
2421 configJSON = configJSON.replace("'", '"')
2424 config = json.loads(configJSON)
2425 except (ValueError,), err:
2426 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2429 playlist = config['playlist']
2430 videoUrl = playlist[1]['url']
2435 'uploader': showName,
2436 'upload_date': None,
2440 'thumbnail': imgUrl,
2441 'description': description,
2442 'player_url': playerUrl,
# NOTE(review): garbled extraction — guard/try/return lines missing.
2448 class CollegeHumorIE(InfoExtractor):
2449 """Information extractor for collegehumor.com"""
2451 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2452 IE_NAME = u'collegehumor'
2454 def report_webpage(self, video_id):
2455 """Report information extraction."""
2456 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2458 def report_extraction(self, video_id):
2459 """Report information extraction."""
2460 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Flow: scrape internal video id from the page, then fetch the moogaloop
# metadata XML for title/description/file URL/thumbnail.
2462 def _real_extract(self, url):
2463 mobj = re.match(self._VALID_URL, url)
2465 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2467 video_id = mobj.group('videoid')
2469 self.report_webpage(video_id)
2470 request = urllib2.Request(url)
2472 webpage = urllib2.urlopen(request).read()
2473 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2474 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2477 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2479 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2481 internal_video_id = m.group('internalvideoid')
2485 'internal_id': internal_video_id,
2488 self.report_extraction(video_id)
2489 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2491 metaXml = urllib2.urlopen(xmlUrl).read()
2492 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2493 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2496 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2498 videoNode = mdoc.findall('./video')[0]
2499 info['description'] = videoNode.findall('./description')[0].text
2500 info['title'] = videoNode.findall('./caption')[0].text
2501 info['url'] = videoNode.findall('./file')[0].text
2502 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension taken from the media URL's suffix.
2503 info['ext'] = info['url'].rpartition('.')[2]
2504 info['format'] = info['ext']
2506 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# NOTE(review): garbled extraction — guard/try/return lines missing.
2512 class XVideosIE(InfoExtractor):
2513 """Information extractor for xvideos.com"""
2515 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2516 IE_NAME = u'xvideos'
2518 def report_webpage(self, video_id):
2519 """Report information extraction."""
2520 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2522 def report_extraction(self, video_id):
2523 """Report information extraction."""
2524 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Scrapes flv_url, <title> and the thumbnail URL directly from the page.
2526 def _real_extract(self, url):
2527 mobj = re.match(self._VALID_URL, url)
2529 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2531 video_id = mobj.group(1).decode('utf-8')
2533 self.report_webpage(video_id)
2535 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2537 webpage = urllib2.urlopen(request).read()
2538 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2539 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2542 self.report_extraction(video_id)
2546 mobj = re.search(r'flv_url=(.+?)&', webpage)
2548 self._downloader.trouble(u'ERROR: unable to extract video url')
2550 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2554 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2556 self._downloader.trouble(u'ERROR: unable to extract video title')
2558 video_title = mobj.group(1).decode('utf-8')
2561 # Extract video thumbnail
2562 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2564 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched URL, not the capture group.
2566 video_thumbnail = mobj.group(0).decode('utf-8')
2572 'upload_date': None,
2573 'title': video_title,
2576 'thumbnail': video_thumbnail,
2577 'description': None,
# NOTE(review): garbled extraction — guard/try/return lines missing.
2584 class SoundcloudIE(InfoExtractor):
2585 """Information extractor for soundcloud.com
2586 To access the media, the uid of the song and a stream token
2587 must be extracted from the page source and the script must make
2588 a request to media.soundcloud.com/crossdomain.xml. Then
2589 the media can be grabbed by requesting from an url composed
2590 of the stream token and uid
2593 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2594 IE_NAME = u'soundcloud'
2596 def __init__(self, downloader=None):
2597 InfoExtractor.__init__(self, downloader)
2599 def report_webpage(self, video_id):
2600 """Report information extraction."""
2601 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2603 def report_extraction(self, video_id):
2604 """Report information extraction."""
2605 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2607 def _real_extract(self, url):
2608 mobj = re.match(self._VALID_URL, url)
2610 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2613 # extract uploader (which is in the url)
2614 uploader = mobj.group(1).decode('utf-8')
2615 # extract simple title (uploader + slug of song title)
2616 slug_title = mobj.group(2).decode('utf-8')
2617 simple_title = uploader + u'-' + slug_title
2619 self.report_webpage('%s/%s' % (uploader, slug_title))
2621 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2623 webpage = urllib2.urlopen(request).read()
2624 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2625 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2628 self.report_extraction('%s/%s' % (uploader, slug_title))
2630 # extract uid and stream token that soundcloud hands out for access
2631 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2633 video_id = mobj.group(1)
2634 stream_token = mobj.group(2)
2636 # extract unsimplified title
2637 mobj = re.search('"title":"(.*?)",', webpage)
2639 title = mobj.group(1).decode('utf-8')
2641 title = simple_title
2643 # construct media url (with uid/token)
2644 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2645 mediaURL = mediaURL % (video_id, stream_token)
2648 description = u'No description available'
2649 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2651 description = mobj.group(1)
# Upload date parsed from the human-readable "pretty-date" markup;
# parse failures are logged to stderr, not fatal.
2655 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2658 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2659 except Exception, e:
2660 self._downloader.to_stderr(str(e))
2662 # for soundcloud, a request to a cross domain is required for cookies
2663 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2666 'id': video_id.decode('utf-8'),
2668 'uploader': uploader.decode('utf-8'),
2669 'upload_date': upload_date,
2674 'description': description.decode('utf-8')
# NOTE(review): garbled extraction — guard/try/return lines missing;
# IE_NAME assignment (line 2682) also appears to be missing.
2678 class InfoQIE(InfoExtractor):
2679 """Information extractor for infoq.com"""
2681 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2684 def report_webpage(self, video_id):
2685 """Report information extraction."""
2686 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2688 def report_extraction(self, video_id):
2689 """Report information extraction."""
2690 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2692 def _real_extract(self, url):
2693 mobj = re.match(self._VALID_URL, url)
2695 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2698 self.report_webpage(url)
2700 request = urllib2.Request(url)
2702 webpage = urllib2.urlopen(request).read()
2703 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2704 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2707 self.report_extraction(url)
# The RTMP path is base64-encoded in the page's jsclassref attribute.
2711 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2713 self._downloader.trouble(u'ERROR: unable to extract video url')
2715 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2719 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2721 self._downloader.trouble(u'ERROR: unable to extract video title')
2723 video_title = mobj.group(1).decode('utf-8')
2725 # Extract description
2726 video_description = u'No description available.'
2727 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2728 if mobj is not None:
2729 video_description = mobj.group(1).decode('utf-8')
# Video id and extension both come from the URL's final path component.
2731 video_filename = video_url.split('/')[-1]
2732 video_id, extension = video_filename.split('.')
2738 'upload_date': None,
2739 'title': video_title,
2741 'format': extension, # Extension is always(?) mp4, but seems to be flv
2743 'description': video_description,
# NOTE(review): garbled extraction — try:, return, break/continue and loop
# glue lines are missing throughout.
2749 class MixcloudIE(InfoExtractor):
2750 """Information extractor for www.mixcloud.com"""
2751 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2752 IE_NAME = u'mixcloud'
2754 def __init__(self, downloader=None):
2755 InfoExtractor.__init__(self, downloader)
2757 def report_download_json(self, file_id):
2758 """Report JSON download."""
2759 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2761 def report_extraction(self, file_id):
2762 """Report information extraction."""
2763 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2765 def get_urls(self, jsonData, fmt, bitrate='best'):
2766 """Get urls from 'audio_formats' section in json"""
2769 bitrate_list = jsonData[fmt]
2770 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2771 bitrate = max(bitrate_list) # select highest
2773 url_list = jsonData[fmt][bitrate]
# Some entries have no per-bitrate nesting; indexing by bitrate then raises
# TypeError and the flat list is used instead.
2774 except TypeError: # we have no bitrate info.
2775 url_list = jsonData[fmt]
2778 def check_urls(self, url_list):
2779 """Returns 1st active url from list"""
# Probes each candidate URL and keeps the first that opens successfully.
2780 for url in url_list:
2782 urllib2.urlopen(url)
2784 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2789 def _print_formats(self, formats):
2790 print 'Available formats:'
2791 for fmt in formats.keys():
2792 for b in formats[fmt]:
2794 ext = formats[fmt][b][0]
2795 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2796 except TypeError: # we have no bitrate info
2797 ext = formats[fmt][0]
2798 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2801 def _real_extract(self, url):
2802 mobj = re.match(self._VALID_URL, url)
2804 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2806 # extract uploader & filename from url
2807 uploader = mobj.group(1).decode('utf-8')
2808 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2810 # construct API request
2811 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2812 # retrieve .json file with links to files
2813 request = urllib2.Request(file_url)
2815 self.report_download_json(file_url)
2816 jsonData = urllib2.urlopen(request).read()
2817 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2818 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2822 json_data = json.loads(jsonData)
2823 player_url = json_data['player_swf_url']
2824 formats = dict(json_data['audio_formats'])
2826 req_format = self._downloader.params.get('format', None)
2829 if self._downloader.params.get('listformats', None):
2830 self._print_formats(formats)
# Default/best: first format whose candidate URL list yields a live URL.
2833 if req_format is None or req_format == 'best':
2834 for format_param in formats.keys():
2835 url_list = self.get_urls(formats, format_param)
2837 file_url = self.check_urls(url_list)
2838 if file_url is not None:
2841 if req_format not in formats.keys():
2842 self._downloader.trouble(u'ERROR: format is not available')
2845 url_list = self.get_urls(formats, req_format)
2846 file_url = self.check_urls(url_list)
2847 format_param = req_format
2850 'id': file_id.decode('utf-8'),
2851 'url': file_url.decode('utf-8'),
2852 'uploader': uploader.decode('utf-8'),
2853 'upload_date': u'NA',
2854 'title': json_data['name'],
2855 'ext': file_url.split('.')[-1].decode('utf-8'),
2856 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2857 'thumbnail': json_data['thumbnail_url'],
2858 'description': json_data['description'],
2859 'player_url': player_url.decode('utf-8'),
# NOTE(review): garbled extraction — try:, return and info-dict openers
# are missing.
2862 class StanfordOpenClassroomIE(InfoExtractor):
2863 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: a specific video, a course page, or the site root.
2865 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2866 IE_NAME = u'stanfordoc'
2868 def report_download_webpage(self, objid):
2869 """Report information extraction."""
2870 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2872 def report_extraction(self, video_id):
2873 """Report information extraction."""
2874 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2876 def _real_extract(self, url):
2877 mobj = re.match(self._VALID_URL, url)
2879 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Case 1: a single video — metadata comes from a per-video XML file.
2882 if mobj.group('course') and mobj.group('video'): # A specific video
2883 course = mobj.group('course')
2884 video = mobj.group('video')
2886 'id': course + '_' + video,
2889 self.report_extraction(info['id'])
2890 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2891 xmlUrl = baseUrl + video + '.xml'
2893 metaXml = urllib2.urlopen(xmlUrl).read()
2894 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2895 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2897 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2899 info['title'] = mdoc.findall('./title')[0].text
2900 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2902 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2904 info['ext'] = info['url'].rpartition('.')[2]
2905 info['format'] = info['ext']
# Case 2: a course page — emit reference entries, then recurse via extract().
2907 elif mobj.group('course'): # A course page
2908 course = mobj.group('course')
2914 self.report_download_webpage(info['id'])
2916 coursepage = urllib2.urlopen(url).read()
2917 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2918 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2921 m = re.search('<h1>([^<]+)</h1>', coursepage)
2923 info['title'] = unescapeHTML(m.group(1))
2925 info['title'] = info['id']
2927 m = re.search('<description>([^<]+)</description>', coursepage)
2929 info['description'] = unescapeHTML(m.group(1))
2931 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2934 'type': 'reference',
2935 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2939 for entry in info['list']:
2940 assert entry['type'] == 'reference'
2941 results += self.extract(entry['url'])
# Case 3: the root page — same reference/recursion pattern over courses.
2946 'id': 'Stanford OpenClassroom',
2950 self.report_download_webpage(info['id'])
2951 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2953 rootpage = urllib2.urlopen(rootURL).read()
2954 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2955 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2958 info['title'] = info['id']
2960 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2963 'type': 'reference',
2964 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2969 for entry in info['list']:
2970 assert entry['type'] == 'reference'
2971 results += self.extract(entry['url'])
# NOTE(review): garbled extraction — guard/try/return lines missing;
# IE_NAME assignment also appears to be missing.
2974 class MTVIE(InfoExtractor):
2975 """Information extractor for MTV.com"""
2977 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2980 def report_webpage(self, video_id):
2981 """Report information extraction."""
2982 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2984 def report_extraction(self, video_id):
2985 """Report information extraction."""
2986 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Flow: scrape mtv_vt/mtv_an/mtvn_uri meta tags plus the playlist id, then
# query the mediaGen endpoint and take the last (highest-quality) rendition.
2988 def _real_extract(self, url):
2989 mobj = re.match(self._VALID_URL, url)
2991 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2993 if not mobj.group('proto'):
2994 url = 'http://' + url
2995 video_id = mobj.group('videoid')
2996 self.report_webpage(video_id)
2998 request = urllib2.Request(url)
3000 webpage = urllib2.urlopen(request).read()
3001 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3002 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3005 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3007 self._downloader.trouble(u'ERROR: unable to extract song name')
3009 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3010 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3012 self._downloader.trouble(u'ERROR: unable to extract performer')
3014 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3015 video_title = performer + ' - ' + song_name
3017 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3019 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3021 mtvn_uri = mobj.group(1)
3023 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3025 self._downloader.trouble(u'ERROR: unable to extract content id')
3027 content_id = mobj.group(1)
3029 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3030 self.report_extraction(video_id)
3031 request = urllib2.Request(videogen_url)
3033 metadataXml = urllib2.urlopen(request).read()
3034 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3035 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3038 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3039 renditions = mdoc.findall('.//rendition')
3041 # For now, always pick the highest quality.
3042 rendition = renditions[-1]
# Format label built from MIME subtype + WxH + bitrate.
3045 _,_,ext = rendition.attrib['type'].partition('/')
3046 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3047 video_url = rendition.find('./src').text
3049 self._downloader.trouble('Invalid rendition field.')
3055 'uploader': performer,
3056 'title': video_title,
3064 class YoukuIE(InfoExtractor):
# Extractor for v.youku.com. Youku serves one video as several FLV
# segments; each segment becomes its own info dict ("<id>_partNN") in
# the returned list. The per-segment URLs are derived from a seed-keyed
# decoding of the "streamfileids" value in the getPlayList JSON.
3066 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3069 def __init__(self, downloader=None):
3070 InfoExtractor.__init__(self, downloader)
3072 def report_download_webpage(self, file_id):
3073 """Report webpage download."""
3074 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3076 def report_extraction(self, file_id):
3077 """Report information extraction."""
3078 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Session id: millisecond timestamp plus two random components, as
# expected by the getFlvPath endpoint's "sid" parameter.
3081 nowTime = int(time.time() * 1000)
3082 random1 = random.randint(1000,1998)
3083 random2 = random.randint(1000,9999)
3085 return "%d%d%d" %(nowTime,random1,random2)
3087 def _get_file_ID_mix_string(self, seed):
# Deterministic pseudo-random shuffle of the alphabet below, keyed by
# `seed` (a linear-congruential step mod 65536 picks each next char).
# The result is the substitution table used by _get_file_id.
3089 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3091 for i in range(len(source)):
3092 seed = (seed * 211 + 30031 ) % 65536
3093 index = math.floor(seed / 65536 * len(source) )
3094 mixed.append(source[int(index)])
3095 source.remove(source[int(index)])
3096 #return ''.join(mixed)
3099 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated index string from the playlist JSON into the
# real file id, using the seed-shuffled alphabet as a lookup table.
3100 mixed = self._get_file_ID_mix_string(seed)
3101 ids = fileId.split('*')
3105 realId.append(mixed[int(ch)])
3106 return ''.join(realId)
3108 def _real_extract(self, url):
3109 mobj = re.match(self._VALID_URL, url)
3111 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3113 video_id = mobj.group('ID')
# getPlayList returns JSON describing title, seed, formats and segments.
3115 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3117 request = urllib2.Request(info_url, None, std_headers)
3119 self.report_download_webpage(video_id)
3120 jsondata = urllib2.urlopen(request).read()
3121 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3122 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3125 self.report_extraction(video_id)
3127 config = json.loads(jsondata)
3129 video_title = config['data'][0]['title']
3130 seed = config['data'][0]['seed']
# Map the requested --format onto one of the site's stream ids
# (e.g. 'hd2' is the highest quality when available).
3132 format = self._downloader.params.get('format', None)
3133 supported_format = config['data'][0]['streamfileids'].keys()
3135 if format is None or format == 'best':
3136 if 'hd2' in supported_format:
3141 elif format == 'worst':
# One encoded file id covers the whole video; 'segs' lists the per-segment
# access keys ('k') that must accompany each download URL.
3149 fileid = config['data'][0]['streamfileids'][format]
3150 seg_number = len(config['data'][0]['segs'][format])
3153 for i in xrange(seg_number):
3154 keys.append(config['data'][0]['segs'][format][i]['k'])
3157 #youku only could be viewed from mainland china
3159 self._downloader.trouble(u'ERROR: unable to extract info section')
3163 sid = self._gen_sid()
3164 fileid = self._get_file_id(fileid, seed)
3166 #column 8,9 of fileid represent the segment number
3167 #fileid[7:9] should be changed
3168 for index, key in enumerate(keys):
# Splice the zero-based segment index (as two hex digits) into the
# decoded fileid, then build the per-segment FLV URL.
3170 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3171 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3174 'id': '%s_part%02d' % (video_id, index),
3175 'url': download_url,
3177 'title': video_title,
3181 files_info.append(info)
3186 class XNXXIE(InfoExtractor):
3187 """Information extractor for xnxx.com"""
3189 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# All three values are scraped directly from the watch page: the FLV URL
# and thumbnail from player query-string parameters, the title from <title>.
3191 VIDEO_URL_RE = r'flv_url=(.*?)&'
3192 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3193 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3195 def report_webpage(self, video_id):
3196 """Report that the watch page is being downloaded."""
3197 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3199 def report_extraction(self, video_id):
3200 """Report information extraction"""
3201 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3203 def _real_extract(self, url):
3204 mobj = re.match(self._VALID_URL, url)
3206 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3208 video_id = mobj.group(1).decode('utf-8')
3210 self.report_webpage(video_id)
3212 # Get webpage content
3214 webpage = urllib2.urlopen(url).read()
3215 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3216 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# flv_url is percent-encoded in the page, hence the unquote.
3219 result = re.search(self.VIDEO_URL_RE, webpage)
3221 self._downloader.trouble(u'ERROR: unable to extract video url')
3223 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3225 result = re.search(self.VIDEO_TITLE_RE, webpage)
3227 self._downloader.trouble(u'ERROR: unable to extract video title')
3229 video_title = result.group(1).decode('utf-8')
3231 result = re.search(self.VIDEO_THUMB_RE, webpage)
3233 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3235 video_thumbnail = result.group(1).decode('utf-8')
# The site exposes no upload date or description, so those stay None.
3237 info = {'id': video_id,
3240 'upload_date': None,
3241 'title': video_title,
3244 'thumbnail': video_thumbnail,
3245 'description': None,
3251 class GooglePlusIE(InfoExtractor):
3252 """Information extractor for plus.google.com."""
# Two-step extraction: (1) scrape the post page for date/uploader/title
# and the photo-viewer URL, (2) scrape that viewer page for the actual
# googlevideo.com redirector links.
3254 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3255 IE_NAME = u'plus.google'
3257 def __init__(self, downloader=None):
3258 InfoExtractor.__init__(self, downloader)
3260 def report_extract_entry(self, url):
3261 """Report downloading entry"""
3262 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3264 def report_date(self, upload_date):
3265 """Report the entry's upload date"""
3266 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3268 def report_uploader(self, uploader):
3269 """Report the entry's uploader"""
3270 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3272 def report_title(self, video_title):
3273 """Report the video title"""
3274 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3276 def report_extract_vid_page(self, video_page):
3277 """Report information extraction."""
3278 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3280 def _real_extract(self, url):
3281 # Extract id from URL
3282 mobj = re.match(self._VALID_URL, url)
3284 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# group(0) is the full matched post URL; group(2) the post id.
3287 post_url = mobj.group(0)
3288 video_id = mobj.group(2)
3290 video_extension = 'flv'
3292 # Step 1, Retrieve post webpage to extract further information
3293 self.report_extract_entry(post_url)
3294 request = urllib2.Request(post_url)
3296 webpage = urllib2.urlopen(request).read()
3297 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3298 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
3301 # Extract update date
3303 pattern = 'title="Timestamp">(.*?)</a>'
3304 mobj = re.search(pattern, webpage)
3306 upload_date = mobj.group(1)
3307 # Convert timestring to a format suitable for filename
3308 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3309 upload_date = upload_date.strftime('%Y%m%d')
3310 self.report_date(upload_date)
# The uploader's display name is the rel="author" anchor text.
3314 pattern = r'rel\="author".*?>(.*?)</a>'
3315 mobj = re.search(pattern, webpage)
3317 uploader = mobj.group(1)
3318 self.report_uploader(uploader)
3321 # Get the first line for title
3323 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3324 mobj = re.search(pattern, webpage)
3326 video_title = mobj.group(1)
3327 self.report_title(video_title)
3329 # Step 2, Stimulate clicking the image box to launch video
3330 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3331 mobj = re.search(pattern, webpage)
3333 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3335 video_page = mobj.group(1)
3336 request = urllib2.Request(video_page)
3338 webpage = urllib2.urlopen(request).read()
3339 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3340 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3342 self.report_extract_vid_page(video_page)
3345 # Extract video links on video page
3346 """Extract video links of all sizes"""
# findall yields (resolution, url) tuples, one per available size.
3347 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3348 mobj = re.findall(pattern, webpage)
3350 self._downloader.trouble(u'ERROR: unable to extract video links')
3352 # Sort in resolution
3353 links = sorted(mobj)
3355 # Choose the lowest of the sort, i.e. highest resolution
3356 video_url = links[-1]
3357 # Only get the url. The resolution part in the tuple has no use anymore
3358 video_url = video_url[-1]
3359 # Treat escaped \u0026 style hex
3360 video_url = unicode(video_url, "unicode_escape")
3364 'id': video_id.decode('utf-8'),
3366 'uploader': uploader.decode('utf-8'),
3367 'upload_date': upload_date.decode('utf-8'),
3368 'title': video_title.decode('utf-8'),
3369 'ext': video_extension.decode('utf-8'),
3376 class YouPornIE(InfoExtractor):
3377 """Information extractor for youporn.com."""
# Builds one info dict per available download format, then filters that
# list by the user's requested --format ('best', 'worst', 'all', or a
# specific "resolution-bitrate" label).
3379 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3380 IE_NAME = u'youporn'
3381 VIDEO_TITLE_RE = r'videoTitleArea">(?P<title>.*)</h1>'
3382 VIDEO_DATE_RE = r'Date:</b>(?P<date>.*)</li>'
3383 VIDEO_UPLOADER_RE = r'Submitted:</b>(?P<uploader>.*)</li>'
3384 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3385 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3387 def __init__(self, downloader=None):
3388 InfoExtractor.__init__(self, downloader)
3390 def report_id(self, video_id):
3391 """Report finding video ID"""
3392 self._downloader.to_screen(u'[youporn] Video ID: %s' % video_id)
3394 def report_webpage(self, url):
3395 """Report downloading page"""
3396 self._downloader.to_screen(u'[youporn] Downloaded page: %s' % url)
3398 def report_title(self, video_title):
3399 """Report finding title"""
3400 self._downloader.to_screen(u'[youporn] Title: %s' % video_title)
3402 def report_uploader(self, uploader):
3403 """Report finding uploader"""
3404 self._downloader.to_screen(u'[youporn] Uploader: %s' % uploader)
3406 def report_upload_date(self, video_date):
3407 """Report finding date"""
3408 self._downloader.to_screen(u'[youporn] Date: %s' % video_date)
3410 def _print_formats(self, formats):
3411 """Print all available formats"""
3412 print 'Available formats:'
3413 print u'ext\t\tformat'
3414 print u'---------------------------------'
3415 for format in formats:
3416 print u'%s\t\t%s' % (format['ext'], format['format'])
3418 def _specific(self, req_format, formats):
# Select the entry whose 'format' label equals the requested one.
3420 if(x["format"]==req_format):
3425 def _real_extract(self, url):
3426 mobj = re.match(self._VALID_URL, url)
3428 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3431 video_id = mobj.group('videoid').decode('utf-8')
3432 self.report_id(video_id)
3434 # Get webpage content
3436 webpage = urllib2.urlopen(url).read()
3437 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3438 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3440 self.report_webpage(url)
# Title, date and uploader are scraped via the class-level regexes above.
3443 result = re.search(self.VIDEO_TITLE_RE, webpage)
3445 self._downloader.trouble(u'ERROR: unable to extract video title')
3447 video_title = result.group('title').decode('utf-8').strip()
3448 self.report_title(video_title)
3450 # Get the video date
3451 result = re.search(self.VIDEO_DATE_RE, webpage)
3453 self._downloader.trouble(u'ERROR: unable to extract video date')
3455 upload_date = result.group('date').decode('utf-8').strip()
3456 self.report_upload_date(upload_date)
3458 # Get the video uploader
3459 result = re.search(self.VIDEO_UPLOADER_RE, webpage)
3461 self._downloader.trouble(u'ERROR: unable to extract uploader')
3463 video_uploader = result.group('uploader').decode('utf-8').strip()
3464 video_uploader = clean_html( video_uploader )
3465 self.report_uploader(video_uploader)
3467 # Get all of the formats available
3468 result = re.search(self.DOWNLOAD_LIST_RE, webpage)
3470 self._downloader.trouble(u'ERROR: unable to extract download list')
3472 download_list_html = result.group('download_list').decode('utf-8').strip()
3474 # Get all of the links from the page
3475 links = re.findall(self.LINK_RE, download_list_html)
3476 if(len(links) == 0):
3477 self._downloader.trouble(u'ERROR: no known formats available for video')
3480 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3485 # A link looks like this:
3486 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3487 # A path looks like this:
3488 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3489 video_url = unescapeHTML( link.decode('utf-8') )
3490 path = urlparse( video_url ).path
3491 extension = os.path.splitext( path )[1][1:]
# Path segment 4 looks like "480p_370k_<id>"; its first two pieces give
# the size and bitrate used to label this format.
3492 format = path.split('/')[4].split('_')[:2]
3495 format = "-".join( format )
3496 title = u'%s-%s-%s' % (video_title, size, bitrate)
3501 'uploader': video_uploader,
3502 'upload_date': upload_date,
3507 'description': None,
# --list-formats short-circuits: print the table instead of downloading.
3511 if self._downloader.params.get('listformats', None):
3512 self._print_formats(results)
3515 req_format = self._downloader.params.get('format', None)
3516 #format_limit = self._downloader.params.get('format_limit', None)
3517 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Formats are ordered best-first here, so 'worst' is the last entry.
3520 if req_format is None or req_format == 'best':
3522 elif req_format == 'worst':
3523 return [formats[-1]]
3524 elif req_format in ('-1', 'all'):
3527 format = self._specific( req_format, formats )
3529 self._downloader.trouble(u'ERROR: requested format not available')
3536 class PornotubeIE(InfoExtractor):
3537 """Information extractor for pornotube.com."""
3539 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3540 IE_NAME = u'pornotube'
3541 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3542 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3545 def __init__(self, downloader=None):
3546 InfoExtractor.__init__(self, downloader)
3548 def report_extract_entry(self, url):
3549 """Report downloading extry"""
3550 self._downloader.to_screen(u'[pornotube] Downloading entry: %s' % url.decode('utf-8'))
3552 def report_date(self, upload_date):
3553 """Report finding uploaded date"""
3554 self._downloader.to_screen(u'[pornotube] Entry date: %s' % upload_date)
3556 def report_webpage(self, url):
3557 """Report downloading page"""
3558 self._downloader.to_screen(u'[pornotube] Downloaded page: %s' % url)
3560 def report_title(self, video_title):
3561 """Report downloading extry"""
3562 self._downloader.to_screen(u'[pornotube] Title: %s' % video_title.decode('utf-8'))
3564 def _real_extract(self, url):
3565 mobj = re.match(self._VALID_URL, url)
3567 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3570 video_id = mobj.group('videoid').decode('utf-8')
3571 video_title = mobj.group('title').decode('utf-8')
3572 self.report_title(video_title);
3574 # Get webpage content
3576 webpage = urllib2.urlopen(url).read()
3577 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3578 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3580 self.report_webpage(url)
3583 result = re.search(self.VIDEO_URL_RE, webpage)
3585 self._downloader.trouble(u'ERROR: unable to extract video url')
3587 video_url = urllib.unquote(result.group('url').decode('utf-8'))
3588 self.report_extract_entry(video_url)
3590 #Get the uploaded date
3591 result = re.search(self.VIDEO_UPLOADED_RE, webpage)
3593 self._downloader.trouble(u'ERROR: unable to extract video title')
3595 upload_date = result.group('date').decode('utf-8')
3596 self.report_date(upload_date);
3599 info = {'id': video_id,
3602 'upload_date': upload_date,
3603 'title': video_title,
3607 'description': None,