2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
13 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False          # True once _real_initialize() has run
    _downloader = None      # FileDownloader instance (set via set_downloader)
    _WORKING = True         # set to False in subclasses that are known-broken

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Only run the (possibly expensive) real initialization once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: class name minus the trailing "IE" suffix.
        return type(self).__name__[:-2]
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): reconstructed from a garbled paste. Group layout matters:
    # group 1 is the optional URL prefix, group 2 is the video ID
    # (_extract_id reads mobj.group(2)); the pattern requires re.VERBOSE.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed defaults to 'flv'.
    # NOTE(review): dict bodies reconstructed from upstream youtube-dl — verify.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" display string, used by _print_formats / --list-formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base implementation because _VALID_URL needs re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None
    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
209 def _closed_captions_xml_to_srt(self, xml_string):
211 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
212 # TODO parse xml instead of regex
213 for n, (start, dur_tag, dur, caption) in enumerate(texts):
214 if not dur: dur = '4'
216 end = start + float(dur)
217 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
218 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
219 caption = unescapeHTML(caption)
220 caption = unescapeHTML(caption) # double cycle, intentional
221 srt += str(n+1) + '\n'
222 srt += start + ' --> ' + end + '\n'
223 srt += caption + '\n\n'
226 def _extract_subtitles(self, video_id):
227 self.report_video_subtitles_download(video_id)
228 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
230 srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
231 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
232 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
233 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
234 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
235 if not srt_lang_list:
236 return (u'WARNING: video has no closed captions', None)
237 if self._downloader.params.get('subtitleslang', False):
238 srt_lang = self._downloader.params.get('subtitleslang')
239 elif 'en' in srt_lang_list:
242 srt_lang = list(srt_lang_list.keys())[0]
243 if not srt_lang in srt_lang_list:
244 return (u'WARNING: no closed captions found in the specified language', None)
245 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
247 srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
248 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
249 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
251 return (u'WARNING: unable to download video subtitles', None)
252 return (None, self._closed_captions_xml_to_srt(srt_xml))
254 def _print_formats(self, formats):
255 print('Available formats:')
257 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
259 def _real_initialize(self):
260 if self._downloader is None:
265 downloader_params = self._downloader.params
267 # Attempt to use provided username and password or .netrc data
268 if downloader_params.get('username', None) is not None:
269 username = downloader_params['username']
270 password = downloader_params['password']
271 elif downloader_params.get('usenetrc', False):
273 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
278 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
279 except (IOError, netrc.NetrcParseError) as err:
280 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
284 request = compat_urllib_request.Request(self._LANG_URL)
287 compat_urllib_request.urlopen(request).read()
288 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
289 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
292 # No authentication to be performed
298 'current_form': 'loginForm',
300 'action_login': 'Log In',
301 'username': username,
302 'password': password,
304 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
307 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
308 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
309 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
311 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
312 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
318 'action_confirm': 'Confirm',
320 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
322 self.report_age_confirmation()
323 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
324 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
325 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
328 def _extract_id(self, url):
329 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
331 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
333 video_id = mobj.group(2)
336 def _real_extract(self, url):
337 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
338 mobj = re.search(self._NEXT_URL_RE, url)
340 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
341 video_id = self._extract_id(url)
344 self.report_video_webpage_download(video_id)
345 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
346 request = compat_urllib_request.Request(url)
348 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
349 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
350 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
353 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
355 # Attempt to extract SWF player URL
356 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
358 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
363 self.report_video_info_webpage_download(video_id)
364 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
365 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
366 % (video_id, el_type))
367 request = compat_urllib_request.Request(video_info_url)
369 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
370 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
371 video_info = compat_parse_qs(video_info_webpage)
372 if 'token' in video_info:
374 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
375 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
377 if 'token' not in video_info:
378 if 'reason' in video_info:
379 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
381 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
384 # Check for "rental" videos
385 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
386 self._downloader.trouble(u'ERROR: "rental" videos not supported')
389 # Start extracting information
390 self.report_information_extraction(video_id)
393 if 'author' not in video_info:
394 self._downloader.trouble(u'ERROR: unable to extract uploader name')
396 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
399 video_uploader_id = None
400 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
402 video_uploader_id = mobj.group(1)
404 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
407 if 'title' not in video_info:
408 self._downloader.trouble(u'ERROR: unable to extract video title')
410 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
413 if 'thumbnail_url' not in video_info:
414 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
416 else: # don't panic if we can't find it
417 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
421 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
423 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
424 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
425 for expression in format_expressions:
427 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
432 video_description = get_element_by_id("eow-description", video_webpage)
433 if video_description:
434 video_description = clean_html(video_description)
436 video_description = ''
439 video_subtitles = None
440 if self._downloader.params.get('writesubtitles', False):
441 (srt_error, video_subtitles) = self._extract_subtitles(video_id)
443 self._downloader.trouble(srt_error)
445 if 'length_seconds' not in video_info:
446 self._downloader.trouble(u'WARNING: unable to extract video duration')
449 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
452 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
454 # Decide which formats to download
455 req_format = self._downloader.params.get('format', None)
457 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
458 self.report_rtmp_download()
459 video_url_list = [(None, video_info['conn'][0])]
460 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
461 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
462 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
463 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
464 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
466 format_limit = self._downloader.params.get('format_limit', None)
467 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
468 if format_limit is not None and format_limit in available_formats:
469 format_list = available_formats[available_formats.index(format_limit):]
471 format_list = available_formats
472 existing_formats = [x for x in format_list if x in url_map]
473 if len(existing_formats) == 0:
474 self._downloader.trouble(u'ERROR: no known formats available for video')
476 if self._downloader.params.get('listformats', None):
477 self._print_formats(existing_formats)
479 if req_format is None or req_format == 'best':
480 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
481 elif req_format == 'worst':
482 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
483 elif req_format in ('-1', 'all'):
484 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
486 # Specific formats. We pick the first in a slash-delimeted sequence.
487 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
488 req_formats = req_format.split('/')
489 video_url_list = None
490 for rf in req_formats:
492 video_url_list = [(rf, url_map[rf])]
494 if video_url_list is None:
495 self._downloader.trouble(u'ERROR: requested format not available')
498 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
502 for format_param, video_real_url in video_url_list:
504 video_extension = self._video_extensions.get(format_param, 'flv')
506 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
507 self._video_dimensions.get(format_param, '???'))
511 'url': video_real_url,
512 'uploader': video_uploader,
513 'uploader_id': video_uploader_id,
514 'upload_date': upload_date,
515 'title': video_title,
516 'ext': video_extension,
517 'format': video_format,
518 'thumbnail': video_thumbnail,
519 'description': video_description,
520 'player_url': player_url,
521 'subtitles': video_subtitles,
522 'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
554 def _real_initialize(self):
555 # Retrieve disclaimer
556 request = compat_urllib_request.Request(self._DISCLAIMER)
558 self.report_disclaimer()
559 disclaimer = compat_urllib_request.urlopen(request).read()
560 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
561 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
567 'submit': "Continue - I'm over 18",
569 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
571 self.report_age_confirmation()
572 disclaimer = compat_urllib_request.urlopen(request).read()
573 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
574 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
577 def _real_extract(self, url):
578 # Extract id and simplified title from URL
579 mobj = re.match(self._VALID_URL, url)
581 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
584 video_id = mobj.group(1)
586 # Check if video comes from YouTube
587 mobj2 = re.match(r'^yt-(.*)$', video_id)
588 if mobj2 is not None:
589 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
592 # Retrieve video webpage to extract further information
593 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
595 self.report_download_webpage(video_id)
596 webpage = compat_urllib_request.urlopen(request).read()
597 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
598 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
601 # Extract URL, uploader and title from webpage
602 self.report_extraction(video_id)
603 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
605 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
606 video_extension = mediaURL[-3:]
608 # Extract gdaKey if available
609 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
613 gdaKey = mobj.group(1)
614 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
616 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
618 self._downloader.trouble(u'ERROR: unable to extract media URL')
620 vardict = compat_parse_qs(mobj.group(1))
621 if 'mediaData' not in vardict:
622 self._downloader.trouble(u'ERROR: unable to extract media URL')
624 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
626 self._downloader.trouble(u'ERROR: unable to extract media URL')
628 mediaURL = mobj.group(1).replace('\\/', '/')
629 video_extension = mediaURL[-3:]
630 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
632 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
634 self._downloader.trouble(u'ERROR: unable to extract title')
636 video_title = mobj.group(1).decode('utf-8')
638 mobj = re.search(r'submitter=(.*?);', webpage)
640 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
642 video_uploader = mobj.group(1)
645 'id': video_id.decode('utf-8'),
646 'url': video_url.decode('utf-8'),
647 'uploader': video_uploader.decode('utf-8'),
649 'title': video_title,
650 'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
671 def _real_extract(self, url):
672 # Extract id and simplified title from URL
673 mobj = re.match(self._VALID_URL, url)
675 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
678 video_id = mobj.group(1).split('_')[0].split('?')[0]
680 video_extension = 'mp4'
682 # Retrieve video webpage to extract further information
683 request = compat_urllib_request.Request(url)
684 request.add_header('Cookie', 'family_filter=off')
686 self.report_download_webpage(video_id)
687 webpage_bytes = compat_urllib_request.urlopen(request).read()
688 webpage = webpage_bytes.decode('utf-8')
689 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
690 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
693 # Extract URL, uploader and title from webpage
694 self.report_extraction(video_id)
695 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
697 self._downloader.trouble(u'ERROR: unable to extract media URL')
699 flashvars = compat_urllib_parse.unquote(mobj.group(1))
701 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
704 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
707 self._downloader.trouble(u'ERROR: unable to extract video URL')
710 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
712 self._downloader.trouble(u'ERROR: unable to extract video URL')
715 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
717 # TODO: support choosing qualities
719 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
721 self._downloader.trouble(u'ERROR: unable to extract title')
723 video_title = unescapeHTML(mobj.group('title'))
725 video_uploader = None
726 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
728 # lookin for official user
729 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
730 if mobj_official is None:
731 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
733 video_uploader = mobj_official.group(1)
735 video_uploader = mobj.group(1)
737 video_upload_date = None
738 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
740 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
745 'uploader': video_uploader,
746 'upload_date': video_upload_date,
747 'title': video_title,
748 'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
769 def _real_extract(self, url):
770 # Extract id from URL
771 mobj = re.match(self._VALID_URL, url)
773 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
776 video_id = mobj.group(1)
778 video_extension = 'flv'
780 # Retrieve video webpage to extract further information
781 request = compat_urllib_request.Request(url)
783 self.report_download_webpage(video_id)
784 webpage = compat_urllib_request.urlopen(request).read()
785 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
786 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
789 # Extract URL, uploader, and title from webpage
790 self.report_extraction(video_id)
791 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
793 self._downloader.trouble(u'ERROR: unable to extract media URL')
795 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
799 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
801 self._downloader.trouble(u'ERROR: unable to extract title')
803 video_title = mobj.group(1).decode('utf-8')
805 video_uploader = mobj.group(2).decode('utf-8')
808 'id': video_id.decode('utf-8'),
809 'url': video_url.decode('utf-8'),
810 'uploader': video_uploader,
812 'title': video_title,
813 'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
838 def _real_extract(self, url, new_video=True):
839 # Extract ID from URL
840 mobj = re.match(self._VALID_URL, url)
842 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
845 video_id = mobj.group(2)
846 video_extension = 'flv'
848 # Rewrite valid but non-extractable URLs as
849 # extractable English language /watch/ URLs
850 if re.match(self._VPAGE_URL, url) is None:
851 request = compat_urllib_request.Request(url)
853 webpage = compat_urllib_request.urlopen(request).read()
854 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
855 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
858 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
860 self._downloader.trouble(u'ERROR: Unable to extract id field')
862 yahoo_id = mobj.group(1)
864 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
866 self._downloader.trouble(u'ERROR: Unable to extract vid field')
868 yahoo_vid = mobj.group(1)
870 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
871 return self._real_extract(url, new_video=False)
873 # Retrieve video webpage to extract further information
874 request = compat_urllib_request.Request(url)
876 self.report_download_webpage(video_id)
877 webpage = compat_urllib_request.urlopen(request).read()
878 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
879 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
882 # Extract uploader and title from webpage
883 self.report_extraction(video_id)
884 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
886 self._downloader.trouble(u'ERROR: unable to extract video title')
888 video_title = mobj.group(1).decode('utf-8')
890 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
892 self._downloader.trouble(u'ERROR: unable to extract video uploader')
894 video_uploader = mobj.group(1).decode('utf-8')
896 # Extract video thumbnail
897 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
899 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
901 video_thumbnail = mobj.group(1).decode('utf-8')
903 # Extract video description
904 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
906 self._downloader.trouble(u'ERROR: unable to extract video description')
908 video_description = mobj.group(1).decode('utf-8')
909 if not video_description:
910 video_description = 'No description available.'
912 # Extract video height and width
913 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
915 self._downloader.trouble(u'ERROR: unable to extract video height')
917 yv_video_height = mobj.group(1)
919 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
921 self._downloader.trouble(u'ERROR: unable to extract video width')
923 yv_video_width = mobj.group(1)
925 # Retrieve video playlist to extract media URL
926 # I'm not completely sure what all these options are, but we
927 # seem to need most of them, otherwise the server sends a 401.
928 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
929 yv_bitrate = '700' # according to Wikipedia this is hard-coded
930 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
931 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
932 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
934 self.report_download_webpage(video_id)
935 webpage = compat_urllib_request.urlopen(request).read()
936 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
937 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
940 # Extract media URL from playlist XML
941 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
943 self._downloader.trouble(u'ERROR: Unable to extract media URL')
945 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
946 video_url = unescapeHTML(video_url)
949 'id': video_id.decode('utf-8'),
951 'uploader': video_uploader,
953 'title': video_title,
954 'ext': video_extension.decode('utf-8'),
955 'thumbnail': video_thumbnail.decode('utf-8'),
956 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs.
    # Fix: the dot after (?:www|player) was unescaped and matched any
    # character; it is now escaped so only a literal '.' subdomain separator
    # matches.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page markup.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket; abort if nothing matched.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams are recognized by an index-NNN.html final path component.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, or None after reporting an error."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and return a dict built from *matchTuples*.

        Each tuple is (group_index, key, error_message); missing groups abort
        extraction via self._downloader.trouble.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve a live-stream page to its rtmp URL components."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Fix: the concatenated regex fragments were plain strings containing
        # backslash escapes (\., \') — now raw strings so the escapes reach
        # the regex engine untouched.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # NOTE(review): video_url is computed but not returned/downloaded in
        # the visible code — live-stream support looks unfinished; confirm.

    def extractPlus7Stream(self, url):
        """Follow the Plus7 redirect chain and return the video info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): the _VALID_URL definition fell outside the visible chunk;
    # r'.*' matches anything, consistent with a last-resort extractor — confirm
    # against upstream.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our own opener so HEAD requests and 405 fallback are wired in.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Fix: this branch previously reported "unable to extract title",
            # a copy-paste of the title branch's message.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]          # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            # Fix: list(<genexpr>) replaced with a plain list comprehension.
            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            # The API reports the real total; never ask for more than exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # Split the 'gvsearchN:terms' query into its count prefix and terms.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            count = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if count <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, query))
            return
        if count > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
            count = self._max_google_results
        self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        def _enqueue_all(found):
            for vid in found:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])

        collected = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers from the result page.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in collected:
                    collected.append(candidate)
                    if len(collected) == n:
                        # Specified n videos reached
                        _enqueue_all(collected)
                        return

            # Stop when no "next page" link remains.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                _enqueue_all(collected)
                return

            pagenum += 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # Parse 'yvsearchN:terms' into a result count and the search terms.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        try:
            count = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if count <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, query))
            return
        if count > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
            count = self._max_yahoo_results
        self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        def _enqueue_all(found):
            for vid in found:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])

        collected = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; the set keeps lookups O(1).
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in already_seen:
                    collected.append(candidate)
                    already_seen.add(candidate)
                    if len(collected) == n:
                        # Specified n videos reached
                        _enqueue_all(collected)
                        return

            # Stop when no "Next" link remains on the result page.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                _enqueue_all(collected)
                return

            pagenum += 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A trailing single-video component means: download just that video.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        all_ids = []
        pagenum = 1
        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated per page.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            all_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        total = len(all_ids)

        # Honor --playlist-start / --playlist-end (1-based, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            selected = all_ids[playliststart:]
        else:
            selected = all_ids[playliststart:playlistend]

        if len(selected) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(selected)))

        for vid in selected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages until no "Next" marker remains.
        channel_id = mobj.group(1)
        collected = []
        pagenum = 1
        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated per page.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            collected.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(collected)))

        for vid in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        collected = []
        pagenum = 0
        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated per page.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            collected.extend(page_ids)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            pagenum += 1

        all_ids_count = len(collected)
        # Honor --playlist-start / --playlist-end (1-based, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            selected = collected[playliststart:]
        else:
            selected = collected[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(selected)))

        for video_id in selected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # NOTE(review): _PAGE_SIZE is referenced below but its definition fell
    # outside the visible chunk; 12 matches the "currently to 12 videos"
    # comment — confirm against upstream.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Fix: was str(err); compat_str is what every other handler in
                # this file uses and is safe for non-ASCII error text on Py2.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor --playlist-start / --playlist-end (1-based, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Fix: the pattern was a plain string '\s+' (invalid escape
                # sequence); now a raw string.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, several lines dropped) — verify
    # against upstream youtube-dl history.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    _available_formats = ['video', 'highqual', 'lowqual']
    # NOTE(review): dict contents were dropped from the listing; every known
    # Facebook format was served as mp4 — TODO confirm.
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General metadata is embedded in JS calls inside the page
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls, one per known format
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc, if given."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials: proceed anonymously
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form being present in the response means login failed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (optional: only warn when missing)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    # Best-effort: keep upload_date as None on a malformed date
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
        else:
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, some lines dropped) — verify against
    # upstream youtube-dl history.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query with the right separator
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' key or bare
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves the media only to this user agent
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""
    # NOTE(review): reconstructed from a mangled listing; additionally fixes
    # a real bug — the invalid-URL branch called self._download.trouble
    # (missing 'er'), which would raise AttributeError instead of reporting.

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.trouble (AttributeError)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link encodes the media directory; the flv lives next to it
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, several lines dropped, including the
    # format/dimension tables) — verify against upstream youtube-dl history.

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    IE_NAME = u'comedycentral'

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): table contents reconstructed — all bitrates are mp4; TODO confirm
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # NOTE(review): dimensions reconstructed — TODO confirm
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose regexp, so the base-class match
        # (which does not pass re.VERBOSE) cannot be reused.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # Expand :tds / :colbert style abbreviations to the newest episode
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The index page redirects to the newest full episode
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, some lines dropped) — verify against
    # upstream youtube-dl history.

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in the Content-Type header, default utf-8
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, several lines dropped, including the
    # final url/ext assignments) — verify against upstream youtube-dl history.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the fragment URL out of the manifest location and media ids
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        # NOTE(review): f4f fragment container — TODO confirm against upstream
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, some lines dropped) — verify against
    # upstream youtube-dl history.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # Extract video URL
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, some lines dropped) — verify against
    # upstream youtube-dl history.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the permalink to the track's API resource
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    # NOTE(review): reconstructed from a mangled listing (line numbers baked
    # into the text, indentation lost, some lines dropped; IE_NAME line was
    # dropped and is assumed to be u'infoq') — verify against upstream.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: the base64-encoded rtmp path lives in jsclassref
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
# NOTE(review): every line below starts with what appears to be the source
# file's own line number; gaps in that numbering mark lines (try:/if guards,
# returns, blank lines) elided from this capture. Code is left byte-identical;
# only comments/docstrings are added.
2902 class MixcloudIE(InfoExtractor):
2903 """Information extractor for www.mixcloud.com"""
2905 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2906 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2907 IE_NAME = u'mixcloud'
2909 def __init__(self, downloader=None):
2910 InfoExtractor.__init__(self, downloader)
# Progress reporters: route status messages through the shared downloader.
2912 def report_download_json(self, file_id):
2913 """Report JSON download."""
2914 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2916 def report_extraction(self, file_id):
2917 """Report information extraction."""
2918 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2920 def get_urls(self, jsonData, fmt, bitrate='best'):
2921 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either {bitrate: [urls]} or a plain url list; the
# TypeError handler at line 2929 covers the no-bitrate-info shape.
2924 bitrate_list = jsonData[fmt]
2925 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2926 bitrate = max(bitrate_list) # select highest
2928 url_list = jsonData[fmt][bitrate]
2929 except TypeError: # we have no bitrate info.
2930 url_list = jsonData[fmt]
2933 def check_urls(self, url_list):
2934 """Returns 1st active url from list"""
# Probes each candidate URL; network errors (except clause at 2939)
# evidently fall through to the next candidate.
2935 for url in url_list:
2937 compat_urllib_request.urlopen(url)
2939 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Debug helper for --list-formats: prints "format<TAB>bitrate<TAB>[ext]".
2944 def _print_formats(self, formats):
2945 print('Available formats:')
2946 for fmt in formats.keys():
2947 for b in formats[fmt]:
2949 ext = formats[fmt][b][0]
2950 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2951 except TypeError: # we have no bitrate info
2952 ext = formats[fmt][0]
2953 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2956 def _real_extract(self, url):
2957 mobj = re.match(self._VALID_URL, url)
2959 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2961 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups presumes Python 2 byte
# strings -- would raise AttributeError on a Python 3 str; confirm target.
2962 uploader = mobj.group(1).decode('utf-8')
2963 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2965 # construct API request
2966 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2967 # retrieve .json file with links to files
2968 request = compat_urllib_request.Request(file_url)
2970 self.report_download_json(file_url)
2971 jsonData = compat_urllib_request.urlopen(request).read()
2972 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2973 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
# Parse the API response and pick a format.
2977 json_data = json.loads(jsonData)
2978 player_url = json_data['player_swf_url']
2979 formats = dict(json_data['audio_formats'])
2981 req_format = self._downloader.params.get('format', None)
2984 if self._downloader.params.get('listformats', None):
2985 self._print_formats(formats)
# No preference (or 'best'): scan every format for the first live URL.
2988 if req_format is None or req_format == 'best':
2989 for format_param in formats.keys():
2990 url_list = self.get_urls(formats, format_param)
2992 file_url = self.check_urls(url_list)
2993 if file_url is not None:
2996 if req_format not in formats:
2997 self._downloader.trouble(u'ERROR: format is not available')
3000 url_list = self.get_urls(formats, req_format)
3001 file_url = self.check_urls(url_list)
3002 format_param = req_format
3005 'id': file_id.decode('utf-8'),
3006 'url': file_url.decode('utf-8'),
3007 'uploader': uploader.decode('utf-8'),
3008 'upload_date': None,
3009 'title': json_data['name'],
3010 'ext': file_url.split('.')[-1].decode('utf-8'),
# NOTE(review): legacy 'cond and a or b' ternary idiom; the modern,
# safer spelling is "a if cond else b".
3011 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3012 'thumbnail': json_data['thumbnail_url'],
3013 'description': json_data['description'],
3014 'player_url': player_url.decode('utf-8'),
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (try:/if/else headers, returns, dict delimiters).
3017 class StanfordOpenClassroomIE(InfoExtractor):
3018 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): the dots in "openclassroom.stanford.edu" are unescaped and
# therefore match any character -- harmless in practice but imprecise.
3020 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3021 IE_NAME = u'stanfordoc'
3023 def report_download_webpage(self, objid):
3024 """Report webpage download."""
3025 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3027 def report_extraction(self, video_id):
3028 """Report information extraction."""
3029 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Three-way dispatch on the URL: a single video, a course page, or the
# site root. The latter two recurse through self.extract on found links.
3031 def _real_extract(self, url):
3032 mobj = re.match(self._VALID_URL, url)
3034 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3037 if mobj.group('course') and mobj.group('video'): # A specific video
3038 course = mobj.group('course')
3039 video = mobj.group('video')
3041 'id': course + '_' + video,
3043 'upload_date': None,
3046 self.report_extraction(info['id'])
3047 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3048 xmlUrl = baseUrl + video + '.xml'
3050 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3051 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3052 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Per-video metadata is an XML document with <title> and <videoFile>.
3054 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3056 info['title'] = mdoc.findall('./title')[0].text
3057 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3059 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3061 info['ext'] = info['url'].rpartition('.')[2]
3063 elif mobj.group('course'): # A course page
3064 course = mobj.group('course')
3069 'upload_date': None,
3072 self.report_download_webpage(info['id'])
3074 coursepage = compat_urllib_request.urlopen(url).read()
3075 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3076 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
# Title falls back to the id when no <h1> is present (line 3083 branch).
3079 m = re.search('<h1>([^<]+)</h1>', coursepage)
3081 info['title'] = unescapeHTML(m.group(1))
3083 info['title'] = info['id']
3085 m = re.search('<description>([^<]+)</description>', coursepage)
3087 info['description'] = unescapeHTML(m.group(1))
# Collect VideoPage links (deduplicated, order-preserving) as references.
3089 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3092 'type': 'reference',
3093 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3097 for entry in info['list']:
3098 assert entry['type'] == 'reference'
3099 results += self.extract(entry['url'])
# else-branch: site root -- enumerate all course pages.
3104 'id': 'Stanford OpenClassroom',
3107 'upload_date': None,
3110 self.report_download_webpage(info['id'])
3111 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3113 rootpage = compat_urllib_request.urlopen(rootURL).read()
3114 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3115 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3118 info['title'] = info['id']
3120 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3123 'type': 'reference',
3124 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3129 for entry in info['list']:
3130 assert entry['type'] == 'reference'
3131 results += self.extract(entry['url'])
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (if-mobj-is-None guards, try: headers, returns).
3134 class MTVIE(InfoExtractor):
3135 """Information extractor for MTV.com"""
3137 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3140 def report_webpage(self, video_id):
3141 """Report webpage download."""
3142 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3144 def report_extraction(self, video_id):
3145 """Report information extraction."""
3146 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3148 def _real_extract(self, url):
3149 mobj = re.match(self._VALID_URL, url)
3151 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Normalize scheme-less URLs before fetching.
3153 if not mobj.group('proto'):
3154 url = 'http://' + url
3155 video_id = mobj.group('videoid')
3156 self.report_webpage(video_id)
3158 request = compat_urllib_request.Request(url)
3160 webpage = compat_urllib_request.urlopen(request).read()
3161 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3162 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Song/performer/uri/content-id all come from <meta> tags in the page.
# NOTE(review): .decode('iso-8859-1') presumes Python 2 byte strings.
3165 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3167 self._downloader.trouble(u'ERROR: unable to extract song name')
3169 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3170 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3172 self._downloader.trouble(u'ERROR: unable to extract performer')
3174 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3175 video_title = performer + ' - ' + song_name
3177 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message below reads "unable to mtvn_uri" -- the word
# "extract" is missing (runtime string, left unchanged in this doc pass).
3179 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3181 mtvn_uri = mobj.group(1)
3183 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3185 self._downloader.trouble(u'ERROR: unable to extract content id')
3187 content_id = mobj.group(1)
# mediaGen service returns an XML playlist of renditions for this video.
3189 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3190 self.report_extraction(video_id)
3191 request = compat_urllib_request.Request(videogen_url)
3193 metadataXml = compat_urllib_request.urlopen(request).read()
3194 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3195 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3198 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3199 renditions = mdoc.findall('.//rendition')
3201 # For now, always pick the highest quality.
3202 rendition = renditions[-1]
# e.g. type="video/mp4" -> ext "mp4"; format string is "ext-WxH_bitrate".
3205 _,_,ext = rendition.attrib['type'].partition('/')
3206 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3207 video_url = rendition.find('./src').text
3209 self._downloader.trouble('Invalid rendition field.')
3215 'uploader': performer,
3216 'upload_date': None,
3217 'title': video_title,
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (method headers, try:, for-loops, returns, format choice
# branches).
3225 class YoukuIE(InfoExtractor):
3227 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3230 def __init__(self, downloader=None):
3231 InfoExtractor.__init__(self, downloader)
3233 def report_download_webpage(self, file_id):
3234 """Report webpage download."""
3235 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3237 def report_extraction(self, file_id):
3238 """Report information extraction."""
3239 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# (_gen_sid body) Session id: millisecond timestamp + two random components.
3242 nowTime = int(time.time() * 1000)
3243 random1 = random.randint(1000,1998)
3244 random2 = random.randint(1000,9999)
3246 return "%d%d%d" %(nowTime,random1,random2)
3248 def _get_file_ID_mix_string(self, seed):
# Deterministic PRNG-driven shuffle of the alphabet below; 'seed' comes
# from the site's playlist JSON and keys the character permutation.
3250 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3252 for i in range(len(source)):
3253 seed = (seed * 211 + 30031 ) % 65536
3254 index = math.floor(seed / 65536 * len(source) )
3255 mixed.append(source[int(index)])
3256 source.remove(source[int(index)])
3257 #return ''.join(mixed)
3260 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated index string through the seed-shuffled alphabet.
3261 mixed = self._get_file_ID_mix_string(seed)
3262 ids = fileId.split('*')
3266 realId.append(mixed[int(ch)])
3267 return ''.join(realId)
3269 def _real_extract(self, url):
3270 mobj = re.match(self._VALID_URL, url)
3272 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3274 video_id = mobj.group('ID')
3276 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3278 request = compat_urllib_request.Request(info_url, None, std_headers)
3280 self.report_download_webpage(video_id)
3281 jsondata = compat_urllib_request.urlopen(request).read()
3282 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3283 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3286 self.report_extraction(video_id)
3288 jsonstr = jsondata.decode('utf-8')
3289 config = json.loads(jsonstr)
3291 video_title = config['data'][0]['title']
3292 seed = config['data'][0]['seed']
3294 format = self._downloader.params.get('format', None)
3295 supported_format = list(config['data'][0]['streamfileids'].keys())
# 'best' prefers hd2 when available; 'worst' branch elided in this capture.
3297 if format is None or format == 'best':
3298 if 'hd2' in supported_format:
3303 elif format == 'worst':
3311 fileid = config['data'][0]['streamfileids'][format]
3312 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3313 except (UnicodeDecodeError, ValueError, KeyError):
3314 self._downloader.trouble(u'ERROR: unable to extract info section')
3318 sid = self._gen_sid()
3319 fileid = self._get_file_id(fileid, seed)
3321 #column 8,9 of fileid represent the segment number
3322 #fileid[7:9] should be changed
# NOTE(review): the comment above says fileid[7:9], but line 3325 actually
# substitutes fileid[8:10] (keeps [0:8], skips to [10:]) -- confirm which
# is intended; the code, not the comment, is what runs.
3323 for index, key in enumerate(keys):
3325 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3326 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# Each segment becomes its own info dict: "<id>_partNN".
3329 'id': '%s_part%02d' % (video_id, index),
3330 'url': download_url,
3332 'upload_date': None,
3333 'title': video_title,
3336 files_info.append(info)
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (guards, try:, return).
3341 class XNXXIE(InfoExtractor):
3342 """Information extractor for xnxx.com"""
3344 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flv URL and thumbnail come from flashvars-style
# query parameters, the title from the <title> tag.
3346 VIDEO_URL_RE = r'flv_url=(.*?)&'
3347 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3348 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3350 def report_webpage(self, video_id):
3351 """Report webpage download"""
3352 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3354 def report_extraction(self, video_id):
3355 """Report information extraction"""
3356 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3358 def _real_extract(self, url):
3359 mobj = re.match(self._VALID_URL, url)
3361 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3363 video_id = mobj.group(1)
3365 self.report_webpage(video_id)
3367 # Get webpage content
3369 webpage_bytes = compat_urllib_request.urlopen(url).read()
3370 webpage = webpage_bytes.decode('utf-8')
3371 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3372 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3375 result = re.search(self.VIDEO_URL_RE, webpage)
3377 self._downloader.trouble(u'ERROR: unable to extract video url')
# flv_url is percent-encoded inside the page.
3379 video_url = compat_urllib_parse.unquote(result.group(1))
3381 result = re.search(self.VIDEO_TITLE_RE, webpage)
3383 self._downloader.trouble(u'ERROR: unable to extract video title')
3385 video_title = result.group(1)
3387 result = re.search(self.VIDEO_THUMB_RE, webpage)
3389 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3391 video_thumbnail = result.group(1)
3397 'upload_date': None,
3398 'title': video_title,
3400 'thumbnail': video_thumbnail,
3401 'description': None,
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (guards, try:, else branches, return).
3405 class GooglePlusIE(InfoExtractor):
3406 """Information extractor for plus.google.com."""
3408 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3409 IE_NAME = u'plus.google'
3411 def __init__(self, downloader=None):
3412 InfoExtractor.__init__(self, downloader)
3414 def report_extract_entry(self, url):
3415 """Report downloading entry."""
3416 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3418 def report_date(self, upload_date):
3419 """Report entry upload date."""
3420 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3422 def report_uploader(self, uploader):
3423 """Report entry uploader."""
3424 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3426 def report_title(self, video_title):
3427 """Report entry title."""
3428 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3430 def report_extract_vid_page(self, video_page):
3431 """Report information extraction."""
3432 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3434 def _real_extract(self, url):
3435 # Extract id from URL
3436 mobj = re.match(self._VALID_URL, url)
3438 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3441 post_url = mobj.group(0)
3442 video_id = mobj.group(1)
3444 video_extension = 'flv'
3446 # Step 1, Retrieve post webpage to extract further information
3447 self.report_extract_entry(post_url)
3448 request = compat_urllib_request.Request(post_url)
3450 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3451 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3452 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3455 # Extract update date
3457 pattern = 'title="Timestamp">(.*?)</a>'
3458 mobj = re.search(pattern, webpage)
3460 upload_date = mobj.group(1)
3461 # Convert timestring to a format suitable for filename
3462 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3463 upload_date = upload_date.strftime('%Y%m%d')
3464 self.report_date(upload_date)
# Uploader is taken from the rel="author" anchor.
3468 pattern = r'rel\="author".*?>(.*?)</a>'
3469 mobj = re.search(pattern, webpage)
3471 uploader = mobj.group(1)
3472 self.report_uploader(uploader)
3475 # Get the first line for title
3477 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3478 mobj = re.search(pattern, webpage)
3480 video_title = mobj.group(1)
3481 self.report_title(video_title)
3483 # Step 2, Stimulate clicking the image box to launch video
3484 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3485 mobj = re.search(pattern, webpage)
3487 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3489 video_page = mobj.group(1)
3490 request = compat_urllib_request.Request(video_page)
3492 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3493 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3494 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3496 self.report_extract_vid_page(video_page)
3499 # Extract video links on video page
3500 """Extract video links of all sizes"""
3501 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3502 mobj = re.findall(pattern, webpage)
3504 self._downloader.trouble(u'ERROR: unable to extract video links')
3506 # Sort in resolution
3507 links = sorted(mobj)
3509 # Choose the lowest of the sort, i.e. highest resolution
3510 video_url = links[-1]
3511 # Only get the url. The resolution part in the tuple has no use anymore
3512 video_url = video_url[-1]
3513 # Treat escaped \u0026 style hex
# Py2 str has .decode; on Py3 the AttributeError branch re-encodes and
# decodes via 'unicode-escape' to resolve \uXXXX escapes.
3515 video_url = video_url.decode("unicode_escape")
3516 except AttributeError: # Python 3
3517 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3523 'uploader': uploader,
3524 'upload_date': upload_date,
3525 'title': video_title,
3526 'ext': video_extension,
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (guards, try:, default-return branch of _findProp).
3529 class NBAIE(InfoExtractor):
3530 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3533 def report_extraction(self, video_id):
3534 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3536 def _real_extract(self, url):
3537 mobj = re.match(self._VALID_URL, url)
3539 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3542 video_id = mobj.group(1)
3543 if video_id.endswith('/index.html'):
3544 video_id = video_id[:-len('/index.html')]
3546 self.report_extraction(video_id)
3548 urlh = compat_urllib_request.urlopen(url)
3549 webpage_bytes = urlh.read()
3550 webpage = webpage_bytes.decode('utf-8', 'ignore')
3551 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3552 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# The media URL is built from the id rather than scraped from the page.
3555 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Small helper: first regex group from the page, unescaped, else default.
3556 def _findProp(rexp, default=None):
3557 m = re.search(rexp, webpage)
3559 return unescapeHTML(m.group(1))
3563 shortened_video_id = video_id.rpartition('/')[2]
3564 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3566 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date' (the
# field name documented on InfoExtractor) -- confirm before fixing, as this
# is a behavior change.
3570 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3571 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (guards, try:, loop/return scaffolding).
3575 class JustinTVIE(InfoExtractor):
3576 """Information extractor for justin.tv and twitch.tv"""
3577 # TODO: One broadcast may be split into multiple videos. The key
3578 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3579 # starts at 1 and increases. Can we treat all parts as one video?
3581 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3582 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3583 _JUSTIN_PAGE_LIMIT = 100
3584 IE_NAME = u'justin.tv'
3586 def report_extraction(self, file_id):
3587 """Report information extraction."""
3588 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3590 def report_download_page(self, channel, offset):
3591 """Report attempt to download a single page of videos."""
3592 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3593 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3595 # Return count of items, list of *valid* items
3596 def _parse_page(self, url):
3598 urlh = compat_urllib_request.urlopen(url)
3599 webpage_bytes = urlh.read()
3600 webpage = webpage_bytes.decode('utf-8', 'ignore')
3601 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3602 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# API returns a JSON list of clips; each becomes one info dict.
3605 response = json.loads(webpage)
3607 for clip in response:
3608 video_url = clip['video_file_url']
3610 video_extension = os.path.splitext(video_url)[1][1:]
# created_on is ISO-ish 'YYYY-MM-DD...'; strip dashes -> YYYYMMDD.
3611 video_date = re.sub('-', '', clip['created_on'][:10])
3615 'title': clip['title'],
3616 'uploader': clip.get('user_id', clip.get('channel_id')),
3617 'upload_date': video_date,
3618 'ext': video_extension,
3620 return (len(response), info)
3622 def _real_extract(self, url):
3623 mobj = re.match(self._VALID_URL, url)
3625 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 2 present => a single archived broadcast; otherwise a whole
# channel, which is paged through the archives endpoint.
3628 api = 'http://api.justin.tv'
3629 video_id = mobj.group(mobj.lastindex)
3631 if mobj.lastindex == 1:
3633 api += '/channel/archives/%s.json'
3635 api += '/clip/show/%s.json'
3636 api = api % (video_id,)
3638 self.report_extraction(video_id)
3642 limit = self._JUSTIN_PAGE_LIMIT
3645 self.report_download_page(video_id, offset)
3646 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3647 page_count, page_info = self._parse_page(page_url)
3648 info.extend(page_info)
# A short page (fewer than `limit` items) means we reached the end.
3649 if not paged or page_count != limit:
# NOTE(review): lines keep the source file's own line numbers; numbering gaps
# mark elided lines (guards, try:, return).
3654 class FunnyOrDieIE(InfoExtractor):
3655 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3656 IE_NAME = u'FunnyOrDie'
3658 def report_extraction(self, video_id):
3659 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3661 def _real_extract(self, url):
3662 mobj = re.match(self._VALID_URL, url)
3664 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3667 video_id = mobj.group('id')
3668 self.report_extraction(video_id)
3670 urlh = compat_urllib_request.urlopen(url)
3671 webpage_bytes = urlh.read()
3672 webpage = webpage_bytes.decode('utf-8', 'ignore')
3673 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3674 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Media URL: second <source> inside the <video> element.
3677 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3679 self._downloader.trouble(u'ERROR: unable to find video information')
3680 video_url = unescapeHTML(m.group('url'))
3682 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3684 self._downloader.trouble(u'Cannot find video title')
3685 title = unescapeHTML(m.group('title'))
3687 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3689 desc = unescapeHTML(m.group('desc'))
3698 'description': desc,
3702 class TweetReelIE(InfoExtractor):
3703 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3705 def report_extraction(self, video_id):
3706 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3708 def _real_extract(self, url):
3709 mobj = re.match(self._VALID_URL, url)
3711 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3714 video_id = mobj.group('id')
3715 self.report_extraction(video_id)
3717 urlh = compat_urllib_request.urlopen(url)
3718 webpage_bytes = urlh.read()
3719 webpage = webpage_bytes.decode('utf-8', 'ignore')
3720 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3721 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
3724 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3726 self._downloader.trouble(u'ERROR: Cannot find status ID')
3727 status_id = m.group(1)
3729 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3731 self._downloader.trouble(u'WARNING: Cannot find description')
3732 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3734 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3736 self._downloader.trouble(u'ERROR: Cannot find uploader')
3737 uploader = unescapeHTML(m.group('uploader'))
3738 uploader_id = unescapeHTML(m.group('uploader_id'))
3740 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3742 self._downloader.trouble(u'ERROR: Cannot find upload date')
3743 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3746 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3753 'description': desc,
3754 'uploader': uploader,
3755 'uploader_id': uploader_id,
3756 'internal_id': status_id,
3757 'upload_date': upload_date