]> gitweb @ CieloNegro.org - youtube-dl.git/blob - youtube-dl
a1d1e2e0da00a8c829a022ce6082c6fbfe7af4ea
[youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# List of contributors; kept as a tuple of names rather than the
# conventional single author string because of the number of people involved.
__author__  = (
        'Ricardo Garcia Gonzalez',
        'Danny Colligan',
        'Benjamin Johnson',
        'Vasyl\' Vavrychuk',
        'Witold Baryluk',
        'Paweł Paprota',
        'Gergely Imreh',
        'Rogério Brito',
        'Philipp Hagemeister',
        'Sören Schulze',
        'Kevin Ngo',
        'Ori Avtalion',
        )

# youtube-dl is released into the public domain.
__license__ = 'Public Domain'
# Date-based version scheme: YYYY.MM.DD of the release.
__version__ = '2011.10.19'

# Location the self-update mechanism fetches the latest script from.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
23
24 import cookielib
25 import datetime
26 import gzip
27 import htmlentitydefs
28 import HTMLParser
29 import httplib
30 import locale
31 import math
32 import netrc
33 import os
34 import os.path
35 import re
36 import socket
37 import string
38 import subprocess
39 import sys
40 import time
41 import urllib
42 import urllib2
43 import warnings
44 import zlib
45
46 if os.name == 'nt':
47         import ctypes
48
49 try:
50         import email.utils
51 except ImportError: # Python 2.4
52         import email.Utils
53 try:
54         import cStringIO as StringIO
55 except ImportError:
56         import StringIO
57
58 # parse_qs was moved from the cgi module to the urlparse module recently.
59 try:
60         from urlparse import parse_qs
61 except ImportError:
62         from cgi import parse_qs
63
64 try:
65         import lxml.etree
66 except ImportError:
67         pass # Handled below
68
69 try:
70         import xml.etree.ElementTree
71 except ImportError: # Python<2.5: Not officially supported, but let it slip
72         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73
# Headers applied to every outgoing HTTP request by YoutubeDLHandler.
# The User-Agent imitates a desktop Firefox so sites serve the same pages
# they would serve a regular browser.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe for "simple" titles: ASCII letters and digits.
# .decode('ascii') turns the Python 2 byte strings into unicode objects.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
83
# Use the stdlib json module when available; on Python <2.6 fall back to
# trivialjson (https://github.com/phihag/trivialjson), a minimal pure-Python
# JSON parser exposed through the same json.loads() entry point.
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                # Drop-in stand-in for the stdlib json module (loads() only).
                @staticmethod
                def loads(s):
                        # The recursive-descent parser below works on a unicode string;
                        # each parse* helper takes an index and returns (next_index, value).
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance i past JSON whitespace; optionally require that
                                # input remains (used everywhere except at end of document).
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # Translate one backslash escape (match.group(1)) to its character.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                # \uXXXX: plain BMP code point.
                                                return unichr(int(esc[1:5], 16))
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                # Surrogate pair \uD8xx\uDCxx -> astral code point.
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                i += 1
                                e = i
                                # Find the closing quote, skipping quotes preceded by an odd
                                # number of backslashes (i.e. escaped quotes).
                                while True:
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                # Decode escapes in one pass; surrogate pairs first so they are
                                # not consumed as two independent \u escapes.
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # The three bare JSON literals.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # A fraction or exponent marker means a float; otherwise int.
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        # Dispatch on the first character of a value; numbers are the default.
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
196
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported encoding actually works before trusting it.
                u'TEST'.encode(pref)
        except Exception:
                # Broken or unknown locale: fall back to a sane default.
                # (The original bare except also trapped SystemExit and
                # KeyboardInterrupt, which should propagate.)
                pref = 'UTF-8'
        return pref
212
213
def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character.

        This function receives a match object and is intended to be used with
        the re.sub() function.
        """
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
                return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference: decimal (&#233;) or hexadecimal (&#xe9;).
        # The hex branch must accept the digits a-f, which plain \d does not,
        # so hex references such as &#xe9; previously fell through unconverted.
        mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
        if mobj is not None:
                numstr = mobj.group(1)
                if numstr.startswith(u'x'):
                        base = 16
                        # long() accepts the '0x...' prefix with an explicit base 16.
                        numstr = u'0%s' % numstr
                else:
                        base = 10
                return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)
239
240
def sanitize_title(utitle):
        """Sanitizes a video title so it could be used as part of a filename."""
        # First decode any HTML entities, then keep the path separator out of
        # the name so the title cannot escape into other directories.
        decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
        return decoded.replace(u'%s' % os.sep, u'%')
245
246
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename == u'-':
                        # "-" means standard output; Windows additionally needs the
                        # stream switched to binary mode.
                        if sys.platform == 'win32':
                                import msvcrt
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                        return (sys.stdout, filename)
                return (open(filename, open_mode), filename)
        except (IOError, OSError):
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)
                # An exception here should be caught in the caller
                return (open(filename, open_mode), filename)
272
273
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp"""
        parsed = email.utils.parsedate_tz(timestr)
        if parsed is None:
                # Unparseable date string.
                return None
        return email.utils.mktime_tz(parsed)
281
282
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects when they are not configured to
        continue on errors; carries the appropriate error message.
        """
291
292
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when they detect that multiple
        files would have to be downloaded to the same file on disk.
        """
300
301
class PostProcessingError(Exception):
        """Post Processing exception.

        May be raised by a PostProcessor's .run() method to signal an
        error in the postprocessing task.
        """
309
310
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        Raised when a video is requested in a format that is not
        available for that video.
        """
318
319
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when a downloaded file is smaller
        than what the server announced first, indicating the connection was
        probably interrupted.
        """
        # Class-level defaults; both counts are in bytes.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                # Note: deliberately does not forward a message to Exception.
                self.downloaded = downloaded
                self.expected = expected
334
335
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Some servers send raw deflate streams without the zlib header;
                # try that first, then fall back to a regular zlib stream.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Python >= 2.6 addinfourl takes the status code directly; older
                # versions need it patched on after construction.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Re-apply the standard headers so they override whatever the
                # caller may have set.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # urllib2 stores header names .capitalize()d, hence the odd
                # casing of the marker header here.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                old_resp = resp
                # gzip: wrap the payload in a decompressing file object while
                # preserving the original url/code/msg on the response.
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
393
394
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible of downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader handles it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge for the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:         Username for authentication purposes.
        password:         Password for authentication purposes.
        usenetrc:         Use netrc for authentication instead.
        quiet:            Do not print messages to stdout.
        forceurl:         Force printing final URL.
        forcetitle:       Force printing title.
        forcethumbnail:   Force printing thumbnail URL.
        forcedescription: Force printing description.
        forcefilename:    Force printing final filename.
        simulate:         Do not download the video files.
        format:           Video format code.
        format_limit:     Highest quality format to try.
        outtmpl:          Template for output names.
        ignoreerrors:     Do not stop on download errors.
        ratelimit:        Download speed limit, in bytes/sec.
        nooverwrites:     Prevent overwriting files.
        retries:          Number of times to retry for HTTP error 5xx
        continuedl:       Try to continue downloads if possible.
        noprogress:       Do not print the progress bar.
        playliststart:    Playlist item to start at.
        playlistend:      Playlist item to end at.
        matchtitle:       Download only matching titles.
        rejecttitle:      Reject downloads for matching titles.
        logtostderr:      Log messages to stderr instead of stdout.
        consoletitle:     Display progress in console window's titlebar.
        nopart:           Do not use temporary .part files.
        updatetime:       Use the Last-modified header to set output file timestamps.
        writedescription: Write the video description to a .description file
        writeinfojson:    Write the video description to a .info.json file
        """

        # Class-level defaults; __init__ replaces all of these with
        # per-instance values.
        params = None                   # options dictionary passed to the constructor
        _ies = []                       # registered InfoExtractors, in order
        _pps = []                       # registered PostProcessors, in order
        _download_retcode = None        # process exit status (0 ok, 1 if any download failed)
        _num_downloads = None           # ordinal of the current download (for %(autonumber)s)
        _screen_file = None             # stream used for status messages (stdout or stderr)
459
460         def __init__(self, params):
461                 """Create a FileDownloader object with the given options."""
462                 self._ies = []
463                 self._pps = []
464                 self._download_retcode = 0
465                 self._num_downloads = 0
466                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
467                 self.params = params
468
469         @staticmethod
470         def format_bytes(bytes):
471                 if bytes is None:
472                         return 'N/A'
473                 if type(bytes) is str:
474                         bytes = float(bytes)
475                 if bytes == 0.0:
476                         exponent = 0
477                 else:
478                         exponent = long(math.log(bytes, 1024.0))
479                 suffix = 'bkMGTPEZY'[exponent]
480                 converted = float(bytes) / float(1024 ** exponent)
481                 return '%.2f%s' % (converted, suffix)
482
483         @staticmethod
484         def calc_percent(byte_counter, data_len):
485                 if data_len is None:
486                         return '---.-%'
487                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
488
489         @staticmethod
490         def calc_eta(start, now, total, current):
491                 if total is None:
492                         return '--:--'
493                 dif = now - start
494                 if current == 0 or dif < 0.001: # One millisecond
495                         return '--:--'
496                 rate = float(current) / dif
497                 eta = long((float(total) - float(current)) / rate)
498                 (eta_mins, eta_secs) = divmod(eta, 60)
499                 if eta_mins > 99:
500                         return '--:--'
501                 return '%02d:%02d' % (eta_mins, eta_secs)
502
503         @staticmethod
504         def calc_speed(start, now, bytes):
505                 dif = now - start
506                 if bytes == 0 or dif < 0.001: # One millisecond
507                         return '%10s' % '---b/s'
508                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
509
510         @staticmethod
511         def best_block_size(elapsed_time, bytes):
512                 new_min = max(bytes / 2.0, 1.0)
513                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
514                 if elapsed_time < 0.001:
515                         return long(new_max)
516                 rate = bytes / elapsed_time
517                 if rate > new_max:
518                         return long(new_max)
519                 if rate < new_min:
520                         return long(new_min)
521                 return long(rate)
522
523         @staticmethod
524         def parse_bytes(bytestr):
525                 """Parse a string indicating a byte quantity into a long integer."""
526                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
527                 if matchobj is None:
528                         return None
529                 number = float(matchobj.group(1))
530                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
531                 return long(round(number * multiplier))
532
533         def add_info_extractor(self, ie):
534                 """Add an InfoExtractor object to the end of the list."""
535                 self._ies.append(ie)
536                 ie.set_downloader(self)
537
538         def add_post_processor(self, pp):
539                 """Add a PostProcessor object to the end of the chain."""
540                 self._pps.append(pp)
541                 pp.set_downloader(self)
542
        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode."""
                try:
                        if not self.params.get('quiet', False):
                                terminator = [u'\n', u''][skip_eol]
                                # The trailing comma suppresses print's own newline; the
                                # chosen terminator is appended to the message instead, so
                                # progress lines can stay on one line (skip_eol=True).
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        # Unencodable messages are silently dropped only when the
                        # caller explicitly allows it (e.g. cosmetic status output).
                        if not ignore_encoding_errors:
                                raise
553
        def to_stderr(self, message):
                """Print message to stderr, encoded for the current locale."""
                print >>sys.stderr, message.encode(preferredencoding())
557
558         def to_cons_title(self, message):
559                 """Set console/terminal window title to message."""
560                 if not self.params.get('consoletitle', False):
561                         return
562                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
563                         # c_wchar_p() might not be necessary if `message` is
564                         # already of type unicode()
565                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
566                 elif 'TERM' in os.environ:
567                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
568
569         def fixed_template(self):
570                 """Checks if the output template is fixed."""
571                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
572
573         def trouble(self, message=None):
574                 """Determine action to take when a download problem appears.
575
576                 Depending on if the downloader has been configured to ignore
577                 download errors or not, this method may throw an exception or
578                 not when errors are found, after printing the message.
579                 """
580                 if message is not None:
581                         self.to_stderr(message)
582                 if not self.params.get('ignoreerrors', False):
583                         raise DownloadError(message)
584                 self._download_retcode = 1
585
586         def slow_down(self, start_time, byte_counter):
587                 """Sleep if the download speed is over the rate limit."""
588                 rate_limit = self.params.get('ratelimit', None)
589                 if rate_limit is None or byte_counter == 0:
590                         return
591                 now = time.time()
592                 elapsed = now - start_time
593                 if elapsed <= 0.0:
594                         return
595                 speed = float(byte_counter) / elapsed
596                 if speed > rate_limit:
597                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
598
599         def temp_name(self, filename):
600                 """Returns a temporary filename for the given filename."""
601                 if self.params.get('nopart', False) or filename == u'-' or \
602                                 (os.path.exists(filename) and not os.path.isfile(filename)):
603                         return filename
604                 return filename + u'.part'
605
606         def undo_temp_name(self, filename):
607                 if filename.endswith(u'.part'):
608                         return filename[:-len(u'.part')]
609                 return filename
610
611         def try_rename(self, old_filename, new_filename):
612                 try:
613                         if old_filename == new_filename:
614                                 return
615                         os.rename(old_filename, new_filename)
616                 except (IOError, OSError), err:
617                         self.trouble(u'ERROR: unable to rename file')
618
619         def try_utime(self, filename, last_modified_hdr):
620                 """Try to set the last-modified time of the given file."""
621                 if last_modified_hdr is None:
622                         return
623                 if not os.path.isfile(filename):
624                         return
625                 timestr = last_modified_hdr
626                 if timestr is None:
627                         return
628                 filetime = timeconvert(timestr)
629                 if filetime is None:
630                         return filetime
631                 try:
632                         os.utime(filename, (time.time(), filetime))
633                 except:
634                         pass
635                 return filetime
636
637         def report_writedescription(self, descfn):
638                 """ Report that the description file is being written """
639                 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
640
641         def report_writeinfojson(self, infofn):
642                 """ Report that the metadata file has been written """
643                 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
644
645         def report_destination(self, filename):
646                 """Report destination filename."""
647                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
648
649         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
650                 """Report download progress."""
651                 if self.params.get('noprogress', False):
652                         return
653                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
654                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
655                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
656                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
657
658         def report_resuming_byte(self, resume_len):
659                 """Report attempt to resume at given byte."""
660                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
661
662         def report_retry(self, count, retries):
663                 """Report retry in case of HTTP error 5xx"""
664                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
665
666         def report_file_already_downloaded(self, file_name):
667                 """Report file has already been fully downloaded."""
668                 try:
669                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
670                 except (UnicodeEncodeError), err:
671                         self.to_screen(u'[download] The file has already been downloaded')
672
673         def report_unable_to_resume(self):
674                 """Report it was impossible to resume download."""
675                 self.to_screen(u'[download] Unable to resume')
676
677         def report_finish(self):
678                 """Report download finished."""
679                 if self.params.get('noprogress', False):
680                         self.to_screen(u'[download] Download completed')
681                 else:
682                         self.to_screen(u'')
683
684         def increment_downloads(self):
685                 """Increment the ordinal that assigns a number to each file."""
686                 self._num_downloads += 1
687
688         def prepare_filename(self, info_dict):
689                 """Generate the output filename."""
690                 try:
691                         template_dict = dict(info_dict)
692                         template_dict['epoch'] = unicode(long(time.time()))
693                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
694                         filename = self.params['outtmpl'] % template_dict
695                         return filename
696                 except (ValueError, KeyError), err:
697                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
698                         return None
699
        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor.

                Handles, in order: forced printing of metadata, simulate mode,
                title match/reject filters, overwrite protection, directory
                creation, optional .description / .info.json sidecar files,
                the actual download, and postprocessing.
                """
                # May be None if the output template is broken (already reported)
                filename = self.prepare_filename(info_dict)
                
                # Forced printings
                if self.params.get('forcetitle', False):
                        print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forceurl', False):
                        print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                        print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcedescription', False) and 'description' in info_dict:
                        print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forcefilename', False) and filename is not None:
                        print filename.encode(preferredencoding(), 'xmlcharrefreplace')
                if self.params.get('forceformat', False):
                        print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        return

                # prepare_filename() already reported the template error
                if filename is None:
                        return

                matchtitle=self.params.get('matchtitle',False)
                rejecttitle=self.params.get('rejecttitle',False)
                title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
                        self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
                        return
                if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
                        self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
                        return
                        
                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists and will be skipped')
                        return

                # Create the destination directory if needed
                try:
                        dn = os.path.dirname(filename)
                        if dn != '' and not os.path.exists(dn):
                                os.makedirs(dn)
                except (OSError, IOError), err:
                        self.trouble(u'ERROR: unable to create directory ' + unicode(err))
                        return

                if self.params.get('writedescription', False):
                        try:
                                descfn = filename + '.description'
                                self.report_writedescription(descfn)
                                descfile = open(descfn, 'wb')
                                try:
                                        descfile.write(info_dict['description'].encode('utf-8'))
                                finally:
                                        descfile.close()
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                                return

                if self.params.get('writeinfojson', False):
                        infofn = filename + '.info.json'
                        self.report_writeinfojson(infofn)
                        # Probe that a usable json module was bound at import time
                        try:
                                json.dump
                        except (NameError,AttributeError):
                                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                                return
                        try:
                                infof = open(infofn, 'wb')
                                try:
                                        # 'urlhandle' is a live file-like object, not JSON-serializable
                                        json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
                                        json.dump(json_info_dict, infof)
                                finally:
                                        infof.close()
                        except (OSError, IOError):
                                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                                return

                if not self.params.get('skip_download', False):
                        try:
                                success = self._do_download(filename, info_dict)
                        except (OSError, IOError), err:
                                raise UnavailableVideoError
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                                return
                        except (ContentTooShortError, ), err:
                                self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                                return
        
                        if success:
                                try:
                                        self.post_process(filename, info_dict)
                                except (PostProcessingError), err:
                                        self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                        return
797
798         def download(self, url_list):
799                 """Download a given list of URLs."""
800                 if len(url_list) > 1 and self.fixed_template():
801                         raise SameFileError(self.params['outtmpl'])
802
803                 for url in url_list:
804                         suitable_found = False
805                         for ie in self._ies:
806                                 # Go to next InfoExtractor if not suitable
807                                 if not ie.suitable(url):
808                                         continue
809
810                                 # Suitable InfoExtractor found
811                                 suitable_found = True
812
813                                 # Extract information from URL and process it
814                                 ie.extract(url)
815
816                                 # Suitable InfoExtractor had been found; go to next URL
817                                 break
818
819                         if not suitable_found:
820                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
821
822                 return self._download_retcode
823
824         def post_process(self, filename, ie_info):
825                 """Run the postprocessing chain on the given file."""
826                 info = dict(ie_info)
827                 info['filepath'] = filename
828                 for pp in self._pps:
829                         info = pp.run(info)
830                         if info is None:
831                                 break
832
        def _download_with_rtmpdump(self, filename, url, player_url):
                """Download an rtmp:// URL by shelling out to rtmpdump.

                Returns True on success and False on failure (the failure is
                reported via self.trouble before returning).
                """
                self.report_destination(filename)
                tmpfilename = self.temp_name(filename)

                # Check for rtmpdump first
                try:
                        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
                except (OSError, IOError):
                        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
                        return False

                # Download using rtmpdump. rtmpdump returns exit code 2 when
                # the connection was interrumpted and resuming appears to be
                # possible. This is part of rtmpdump's normal usage, AFAIK.
                # The [[], [...]][cond] idiom appends the extra arguments only
                # when cond is true (Python 2 has no list ternary here).
                basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
                retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
                while retval == 2 or retval == 1:
                        prevsize = os.path.getsize(tmpfilename)
                        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
                        time.sleep(5.0) # This seems to be needed
                        # Resume (-e); pass '-k 1' only after a code-1 exit
                        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
                        cursize = os.path.getsize(tmpfilename)
                        if prevsize == cursize and retval == 1:
                                # No progress since last attempt: give up
                                break
                        # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
                        if prevsize == cursize and retval == 2 and cursize > 1024:
                                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                                retval = 0
                                break
                if retval == 0:
                        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
                        self.try_rename(tmpfilename, filename)
                        return True
                else:
                        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
                        return False
869
870         def _do_download(self, filename, info_dict):
871                 url = info_dict['url']
872                 player_url = info_dict.get('player_url', None)
873
874                 # Check file already present
875                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
876                         self.report_file_already_downloaded(filename)
877                         return True
878
879                 # Attempt to download using rtmpdump
880                 if url.startswith('rtmp'):
881                         return self._download_with_rtmpdump(filename, url, player_url)
882
883                 tmpfilename = self.temp_name(filename)
884                 stream = None
885
886                 # Do not include the Accept-Encoding header
887                 headers = {'Youtubedl-no-compression': 'True'}
888                 basic_request = urllib2.Request(url, None, headers)
889                 request = urllib2.Request(url, None, headers)
890
891                 # Establish possible resume length
892                 if os.path.isfile(tmpfilename):
893                         resume_len = os.path.getsize(tmpfilename)
894                 else:
895                         resume_len = 0
896
897                 open_mode = 'wb'
898                 if resume_len != 0:
899                         if self.params.get('continuedl', False):
900                                 self.report_resuming_byte(resume_len)
901                                 request.add_header('Range','bytes=%d-' % resume_len)
902                                 open_mode = 'ab'
903                         else:
904                                 resume_len = 0
905
906                 count = 0
907                 retries = self.params.get('retries', 0)
908                 while count <= retries:
909                         # Establish connection
910                         try:
911                                 if count == 0 and 'urlhandle' in info_dict:
912                                         data = info_dict['urlhandle']
913                                 data = urllib2.urlopen(request)
914                                 break
915                         except (urllib2.HTTPError, ), err:
916                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
917                                         # Unexpected HTTP error
918                                         raise
919                                 elif err.code == 416:
920                                         # Unable to resume (requested range not satisfiable)
921                                         try:
922                                                 # Open the connection again without the range header
923                                                 data = urllib2.urlopen(basic_request)
924                                                 content_length = data.info()['Content-Length']
925                                         except (urllib2.HTTPError, ), err:
926                                                 if err.code < 500 or err.code >= 600:
927                                                         raise
928                                         else:
929                                                 # Examine the reported length
930                                                 if (content_length is not None and
931                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
932                                                         # The file had already been fully downloaded.
933                                                         # Explanation to the above condition: in issue #175 it was revealed that
934                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
935                                                         # changing the file size slightly and causing problems for some users. So
936                                                         # I decided to implement a suggested change and consider the file
937                                                         # completely downloaded if the file size differs less than 100 bytes from
938                                                         # the one in the hard drive.
939                                                         self.report_file_already_downloaded(filename)
940                                                         self.try_rename(tmpfilename, filename)
941                                                         return True
942                                                 else:
943                                                         # The length does not match, we start the download over
944                                                         self.report_unable_to_resume()
945                                                         open_mode = 'wb'
946                                                         break
947                         # Retry
948                         count += 1
949                         if count <= retries:
950                                 self.report_retry(count, retries)
951
952                 if count > retries:
953                         self.trouble(u'ERROR: giving up after %s retries' % retries)
954                         return False
955
956                 data_len = data.info().get('Content-length', None)
957                 if data_len is not None:
958                         data_len = long(data_len) + resume_len
959                 data_len_str = self.format_bytes(data_len)
960                 byte_counter = 0 + resume_len
961                 block_size = 1024
962                 start = time.time()
963                 while True:
964                         # Download and write
965                         before = time.time()
966                         data_block = data.read(block_size)
967                         after = time.time()
968                         if len(data_block) == 0:
969                                 break
970                         byte_counter += len(data_block)
971
972                         # Open file just in time
973                         if stream is None:
974                                 try:
975                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
976                                         assert stream is not None
977                                         filename = self.undo_temp_name(tmpfilename)
978                                         self.report_destination(filename)
979                                 except (OSError, IOError), err:
980                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
981                                         return False
982                         try:
983                                 stream.write(data_block)
984                         except (IOError, OSError), err:
985                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
986                                 return False
987                         block_size = self.best_block_size(after - before, len(data_block))
988
989                         # Progress message
990                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
991                         if data_len is None:
992                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
993                         else:
994                                 percent_str = self.calc_percent(byte_counter, data_len)
995                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
996                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
997
998                         # Apply rate limit
999                         self.slow_down(start, byte_counter - resume_len)
1000
1001                 if stream is None:
1002                         self.trouble(u'\nERROR: Did not get any data blocks')
1003                         return False
1004                 stream.close()
1005                 self.report_finish()
1006                 if data_len is not None and byte_counter != data_len:
1007                         raise ContentTooShortError(byte_counter, long(data_len))
1008                 self.try_rename(tmpfilename, filename)
1009
1010                 # Update file modification time
1011                 if self.params.get('updatetime', True):
1012                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1013
1014                 return True
1015
1016
class InfoExtractor(object):
        """Information Extractor class.

        An information extractor takes a URL and pulls out the data needed to
        download the video (or videos) it points to: the real media URL, the
        title and simplified title, author and so on. The result is stored in
        a dictionary which is then passed to the FileDownloader, which may
        download the video to the file system, among other possible outcomes.
        The dictionaries must include the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        The following fields are optional. Their primary purpose is to allow
        youtube-dl to serve as the backend for a video search function, such
        as the one in youtube2mp3. They are only used when their respective
        forced printing functions are called:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses of this one should re-define the _real_initialize() and
        _real_extract() methods and define a _VALID_URL regexp.
        Probably, they should also be added to the list of extractors.
        """

        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor, optionally attaching a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        def suitable(self, url):
                """Return True if this extractor can handle the given URL."""
                match = re.match(self._VALID_URL, url)
                return match is not None

        def initialize(self):
                """Perform one-time initialization (authentication, etc)."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Extract URL information and return it in a list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Attach the downloader this extractor reports to."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
1085
1086
1087 class YoutubeIE(InfoExtractor):
1088         """Information extractor for youtube.com."""
1089
        # Matches youtu.be short links plus youtube.com / youtube-nocookie.com
        # watch, embed and /v/ URLs; group 2 captures the video id
        # (see _real_extract, which reads mobj.group(2)).
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
        # Visiting this URL switches the site to English so scraping works
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name looked up in ~/.netrc for stored credentials
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality
        _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
        # Format code -> container/file extension
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '44': 'webm',
                '45': 'webm',
        }
        # Format code -> video dimensions, as shown by --list-formats output
        # (see _print_formats); '???' means unknown
        _video_dimensions = {
                '5': '240x400',
                '6': '???',
                '13': '???',
                '17': '144x176',
                '18': '360x640',
                '22': '720x1280',
                '34': '360x640',
                '35': '480x854',
                '37': '1080x1920',
                '38': '3072x4096',
                '43': '360x640',
                '44': '480x854',
                '45': '720x1280',
        }
        IE_NAME = u'youtube'
1124
1125         def report_lang(self):
1126                 """Report attempt to set language."""
1127                 self._downloader.to_screen(u'[youtube] Setting language')
1128
1129         def report_login(self):
1130                 """Report attempt to log in."""
1131                 self._downloader.to_screen(u'[youtube] Logging in')
1132
1133         def report_age_confirmation(self):
1134                 """Report attempt to confirm age."""
1135                 self._downloader.to_screen(u'[youtube] Confirming age')
1136
1137         def report_video_webpage_download(self, video_id):
1138                 """Report attempt to download video webpage."""
1139                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1140
1141         def report_video_info_webpage_download(self, video_id):
1142                 """Report attempt to download video info webpage."""
1143                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1144
1145         def report_information_extraction(self, video_id):
1146                 """Report attempt to extract video information."""
1147                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1148
1149         def report_unavailable_format(self, video_id, format):
1150                 """Report extracted video URL."""
1151                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1152
1153         def report_rtmp_download(self):
1154                 """Indicate the download will use the RTMP protocol."""
1155                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1156
1157         def _print_formats(self, formats):
1158                 print 'Available formats:'
1159                 for x in formats:
1160                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1161
        def _real_initialize(self):
                """Force the site language to English and, when credentials are
                available (command line or .netrc), log in and confirm age.
                Login failures are reported as warnings; a failed age
                confirmation is reported via trouble()."""
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language (English pages keep the scraping regexps valid)
                request = urllib2.Request(self._LANG_URL)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # If the login form is still in the response, the
                        # credentials were rejected
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return
1230
	def _real_extract(self, url):
		"""Extract and download a single YouTube video.

		Pipeline: match URL -> fetch watch page -> fetch get_video_info
		(retrying several 'el' variants) -> pull metadata out of the
		parsed query-string dict -> choose format(s) -> hand each chosen
		URL to the FileDownloader via process_info().
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage (needed for SWF player URL, upload date and
		# description; the core stream data comes from get_video_info below).
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL (used later for rtmpdump's
		# --swfVfy). The URL appears JSON-escaped in the page, hence the
		# \\/ in the pattern and the unescaping re.sub afterwards.
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' page types, since some videos
		# (age-gated, Vevo, ...) only answer for particular values. Stop at
		# the first response that contains a 'token'.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				# parse_qs yields a dict mapping each key to a LIST of values;
				# that is why every access below ends in [0].
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			# No variant worked; surface YouTube's own reason if it gave one.
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: collapse every run of non-allowed characters to
		# a single '_' and trim the edges (used for the %(stitle)s template).
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scrape the human-readable date from the watch page,
		# normalize separators, then try a few strftime layouts. The blanket
		# except keeps a failed parse from aborting the download; a format
		# that does not match simply leaves upload_date unchanged.
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					pass

		# description: the bare name reference raises NameError when the
		# optional lxml import at the top of the file failed, selecting the
		# regex fallback; otherwise lxml parses the page properly.
		try:
			lxml.etree
		except NameError:
			video_description = u'No description available.'
			if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
				mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
				if mobj is not None:
					video_description = mobj.group(1).decode('utf-8')
		else:
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
			# TODO use another parser

		# token
		# NOTE(review): video_token is never used below — presumably kept for
		# a legacy URL scheme; confirm before removing.
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			# RTMP stream: single URL, no itag, format reported as NA below.
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			# The stream map is a comma-separated list of query strings,
			# each describing one downloadable format (itag -> url).
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			# _available_formats is ordered best-first; honoring -m/--max-quality
			# means starting the candidate list at the requested cap.
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		# One download per selected format ('-1'/'all' yields several).
		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description,
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
1404
1405
1406 class MetacafeIE(InfoExtractor):
1407         """Information Extractor for metacafe.com."""
1408
1409         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1410         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1411         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1412         _youtube_ie = None
1413         IE_NAME = u'metacafe'
1414
1415         def __init__(self, youtube_ie, downloader=None):
1416                 InfoExtractor.__init__(self, downloader)
1417                 self._youtube_ie = youtube_ie
1418
1419         def report_disclaimer(self):
1420                 """Report disclaimer retrieval."""
1421                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1422
1423         def report_age_confirmation(self):
1424                 """Report attempt to confirm age."""
1425                 self._downloader.to_screen(u'[metacafe] Confirming age')
1426
1427         def report_download_webpage(self, video_id):
1428                 """Report webpage download."""
1429                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1430
1431         def report_extraction(self, video_id):
1432                 """Report information extraction."""
1433                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1434
1435         def _real_initialize(self):
1436                 # Retrieve disclaimer
1437                 request = urllib2.Request(self._DISCLAIMER)
1438                 try:
1439                         self.report_disclaimer()
1440                         disclaimer = urllib2.urlopen(request).read()
1441                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1442                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1443                         return
1444
1445                 # Confirm age
1446                 disclaimer_form = {
1447                         'filters': '0',
1448                         'submit': "Continue - I'm over 18",
1449                         }
1450                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1451                 try:
1452                         self.report_age_confirmation()
1453                         disclaimer = urllib2.urlopen(request).read()
1454                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1455                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1456                         return
1457
1458         def _real_extract(self, url):
1459                 # Extract id and simplified title from URL
1460                 mobj = re.match(self._VALID_URL, url)
1461                 if mobj is None:
1462                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1463                         return
1464
1465                 video_id = mobj.group(1)
1466
1467                 # Check if video comes from YouTube
1468                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1469                 if mobj2 is not None:
1470                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1471                         return
1472
1473                 # At this point we have a new video
1474                 self._downloader.increment_downloads()
1475
1476                 simple_title = mobj.group(2).decode('utf-8')
1477
1478                 # Retrieve video webpage to extract further information
1479                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1480                 try:
1481                         self.report_download_webpage(video_id)
1482                         webpage = urllib2.urlopen(request).read()
1483                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1484                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1485                         return
1486
1487                 # Extract URL, uploader and title from webpage
1488                 self.report_extraction(video_id)
1489                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1490                 if mobj is not None:
1491                         mediaURL = urllib.unquote(mobj.group(1))
1492                         video_extension = mediaURL[-3:]
1493
1494                         # Extract gdaKey if available
1495                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1496                         if mobj is None:
1497                                 video_url = mediaURL
1498                         else:
1499                                 gdaKey = mobj.group(1)
1500                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1501                 else:
1502                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1503                         if mobj is None:
1504                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1505                                 return
1506                         vardict = parse_qs(mobj.group(1))
1507                         if 'mediaData' not in vardict:
1508                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1509                                 return
1510                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1511                         if mobj is None:
1512                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1513                                 return
1514                         mediaURL = mobj.group(1).replace('\\/', '/')
1515                         video_extension = mediaURL[-3:]
1516                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1517
1518                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1519                 if mobj is None:
1520                         self._downloader.trouble(u'ERROR: unable to extract title')
1521                         return
1522                 video_title = mobj.group(1).decode('utf-8')
1523                 video_title = sanitize_title(video_title)
1524
1525                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1526                 if mobj is None:
1527                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1528                         return
1529                 video_uploader = mobj.group(1)
1530
1531                 try:
1532                         # Process video information
1533                         self._downloader.process_info({
1534                                 'id':           video_id.decode('utf-8'),
1535                                 'url':          video_url.decode('utf-8'),
1536                                 'uploader':     video_uploader.decode('utf-8'),
1537                                 'upload_date':  u'NA',
1538                                 'title':        video_title,
1539                                 'stitle':       simple_title,
1540                                 'ext':          video_extension.decode('utf-8'),
1541                                 'format':       u'NA',
1542                                 'player_url':   None,
1543                         })
1544                 except UnavailableVideoError:
1545                         self._downloader.trouble(u'\nERROR: unable to download video')
1546
1547
1548 class DailymotionIE(InfoExtractor):
1549         """Information Extractor for Dailymotion"""
1550
1551         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1552         IE_NAME = u'dailymotion'
1553
1554         def __init__(self, downloader=None):
1555                 InfoExtractor.__init__(self, downloader)
1556
1557         def report_download_webpage(self, video_id):
1558                 """Report webpage download."""
1559                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1560
1561         def report_extraction(self, video_id):
1562                 """Report information extraction."""
1563                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1564
1565         def _real_extract(self, url):
1566                 # Extract id and simplified title from URL
1567                 mobj = re.match(self._VALID_URL, url)
1568                 if mobj is None:
1569                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1570                         return
1571
1572                 # At this point we have a new video
1573                 self._downloader.increment_downloads()
1574                 video_id = mobj.group(1)
1575
1576                 simple_title = mobj.group(2).decode('utf-8')
1577                 video_extension = 'flv'
1578
1579                 # Retrieve video webpage to extract further information
1580                 request = urllib2.Request(url)
1581                 request.add_header('Cookie', 'family_filter=off')
1582                 try:
1583                         self.report_download_webpage(video_id)
1584                         webpage = urllib2.urlopen(request).read()
1585                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1586                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1587                         return
1588
1589                 # Extract URL, uploader and title from webpage
1590                 self.report_extraction(video_id)
1591                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1592                 if mobj is None:
1593                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1594                         return
1595                 sequence = urllib.unquote(mobj.group(1))
1596                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1597                 if mobj is None:
1598                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1599                         return
1600                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1601
1602                 # if needed add http://www.dailymotion.com/ if relative URL
1603
1604                 video_url = mediaURL
1605
1606                 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1607                 if mobj is None:
1608                         self._downloader.trouble(u'ERROR: unable to extract title')
1609                         return
1610                 video_title = mobj.group(1).decode('utf-8')
1611                 video_title = sanitize_title(video_title)
1612
1613                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1614                 if mobj is None:
1615                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1616                         return
1617                 video_uploader = mobj.group(1)
1618
1619                 try:
1620                         # Process video information
1621                         self._downloader.process_info({
1622                                 'id':           video_id.decode('utf-8'),
1623                                 'url':          video_url.decode('utf-8'),
1624                                 'uploader':     video_uploader.decode('utf-8'),
1625                                 'upload_date':  u'NA',
1626                                 'title':        video_title,
1627                                 'stitle':       simple_title,
1628                                 'ext':          video_extension.decode('utf-8'),
1629                                 'format':       u'NA',
1630                                 'player_url':   None,
1631                         })
1632                 except UnavailableVideoError:
1633                         self._downloader.trouble(u'\nERROR: unable to download video')
1634
1635
1636 class GoogleIE(InfoExtractor):
1637         """Information extractor for video.google.com."""
1638
1639         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1640         IE_NAME = u'video.google'
1641
1642         def __init__(self, downloader=None):
1643                 InfoExtractor.__init__(self, downloader)
1644
1645         def report_download_webpage(self, video_id):
1646                 """Report webpage download."""
1647                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1648
1649         def report_extraction(self, video_id):
1650                 """Report information extraction."""
1651                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1652
1653         def _real_extract(self, url):
1654                 # Extract id from URL
1655                 mobj = re.match(self._VALID_URL, url)
1656                 if mobj is None:
1657                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1658                         return
1659
1660                 # At this point we have a new video
1661                 self._downloader.increment_downloads()
1662                 video_id = mobj.group(1)
1663
1664                 video_extension = 'mp4'
1665
1666                 # Retrieve video webpage to extract further information
1667                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1668                 try:
1669                         self.report_download_webpage(video_id)
1670                         webpage = urllib2.urlopen(request).read()
1671                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1672                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1673                         return
1674
1675                 # Extract URL, uploader, and title from webpage
1676                 self.report_extraction(video_id)
1677                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1678                 if mobj is None:
1679                         video_extension = 'flv'
1680                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1681                 if mobj is None:
1682                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1683                         return
1684                 mediaURL = urllib.unquote(mobj.group(1))
1685                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1686                 mediaURL = mediaURL.replace('\\x26', '\x26')
1687
1688                 video_url = mediaURL
1689
1690                 mobj = re.search(r'<title>(.*)</title>', webpage)
1691                 if mobj is None:
1692                         self._downloader.trouble(u'ERROR: unable to extract title')
1693                         return
1694                 video_title = mobj.group(1).decode('utf-8')
1695                 video_title = sanitize_title(video_title)
1696                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1697
1698                 # Extract video description
1699                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1700                 if mobj is None:
1701                         self._downloader.trouble(u'ERROR: unable to extract video description')
1702                         return
1703                 video_description = mobj.group(1).decode('utf-8')
1704                 if not video_description:
1705                         video_description = 'No description available.'
1706
1707                 # Extract video thumbnail
1708                 if self._downloader.params.get('forcethumbnail', False):
1709                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1710                         try:
1711                                 webpage = urllib2.urlopen(request).read()
1712                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1713                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1714                                 return
1715                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1716                         if mobj is None:
1717                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1718                                 return
1719                         video_thumbnail = mobj.group(1)
1720                 else:   # we need something to pass to process_info
1721                         video_thumbnail = ''
1722
1723                 try:
1724                         # Process video information
1725                         self._downloader.process_info({
1726                                 'id':           video_id.decode('utf-8'),
1727                                 'url':          video_url.decode('utf-8'),
1728                                 'uploader':     u'NA',
1729                                 'upload_date':  u'NA',
1730                                 'title':        video_title,
1731                                 'stitle':       simple_title,
1732                                 'ext':          video_extension.decode('utf-8'),
1733                                 'format':       u'NA',
1734                                 'player_url':   None,
1735                         })
1736                 except UnavailableVideoError:
1737                         self._downloader.trouble(u'\nERROR: unable to download video')
1738
1739
1740 class PhotobucketIE(InfoExtractor):
1741         """Information extractor for photobucket.com."""
1742
1743         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1744         IE_NAME = u'photobucket'
1745
1746         def __init__(self, downloader=None):
1747                 InfoExtractor.__init__(self, downloader)
1748
1749         def report_download_webpage(self, video_id):
1750                 """Report webpage download."""
1751                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1752
1753         def report_extraction(self, video_id):
1754                 """Report information extraction."""
1755                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1756
1757         def _real_extract(self, url):
1758                 # Extract id from URL
1759                 mobj = re.match(self._VALID_URL, url)
1760                 if mobj is None:
1761                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1762                         return
1763
1764                 # At this point we have a new video
1765                 self._downloader.increment_downloads()
1766                 video_id = mobj.group(1)
1767
1768                 video_extension = 'flv'
1769
1770                 # Retrieve video webpage to extract further information
1771                 request = urllib2.Request(url)
1772                 try:
1773                         self.report_download_webpage(video_id)
1774                         webpage = urllib2.urlopen(request).read()
1775                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1776                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1777                         return
1778
1779                 # Extract URL, uploader, and title from webpage
1780                 self.report_extraction(video_id)
1781                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1782                 if mobj is None:
1783                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1784                         return
1785                 mediaURL = urllib.unquote(mobj.group(1))
1786
1787                 video_url = mediaURL
1788
1789                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1790                 if mobj is None:
1791                         self._downloader.trouble(u'ERROR: unable to extract title')
1792                         return
1793                 video_title = mobj.group(1).decode('utf-8')
1794                 video_title = sanitize_title(video_title)
1795                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1796
1797                 video_uploader = mobj.group(2).decode('utf-8')
1798
1799                 try:
1800                         # Process video information
1801                         self._downloader.process_info({
1802                                 'id':           video_id.decode('utf-8'),
1803                                 'url':          video_url.decode('utf-8'),
1804                                 'uploader':     video_uploader,
1805                                 'upload_date':  u'NA',
1806                                 'title':        video_title,
1807                                 'stitle':       simple_title,
1808                                 'ext':          video_extension.decode('utf-8'),
1809                                 'format':       u'NA',
1810                                 'player_url':   None,
1811                         })
1812                 except UnavailableVideoError:
1813                         self._downloader.trouble(u'\nERROR: unable to download video')
1814
1815
1816 class YahooIE(InfoExtractor):
1817         """Information extractor for video.yahoo.com."""
1818
1819         # _VALID_URL matches all Yahoo! Video URLs
1820         # _VPAGE_URL matches only the extractable '/watch/' URLs
1821         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1822         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1823         IE_NAME = u'video.yahoo'
1824
1825         def __init__(self, downloader=None):
1826                 InfoExtractor.__init__(self, downloader)
1827
1828         def report_download_webpage(self, video_id):
1829                 """Report webpage download."""
1830                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1831
1832         def report_extraction(self, video_id):
1833                 """Report information extraction."""
1834                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1835
1836         def _real_extract(self, url, new_video=True):
1837                 # Extract ID from URL
1838                 mobj = re.match(self._VALID_URL, url)
1839                 if mobj is None:
1840                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1841                         return
1842
1843                 # At this point we have a new video
1844                 self._downloader.increment_downloads()
1845                 video_id = mobj.group(2)
1846                 video_extension = 'flv'
1847
1848                 # Rewrite valid but non-extractable URLs as
1849                 # extractable English language /watch/ URLs
1850                 if re.match(self._VPAGE_URL, url) is None:
1851                         request = urllib2.Request(url)
1852                         try:
1853                                 webpage = urllib2.urlopen(request).read()
1854                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1855                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1856                                 return
1857
1858                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1859                         if mobj is None:
1860                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1861                                 return
1862                         yahoo_id = mobj.group(1)
1863
1864                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1865                         if mobj is None:
1866                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1867                                 return
1868                         yahoo_vid = mobj.group(1)
1869
1870                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1871                         return self._real_extract(url, new_video=False)
1872
1873                 # Retrieve video webpage to extract further information
1874                 request = urllib2.Request(url)
1875                 try:
1876                         self.report_download_webpage(video_id)
1877                         webpage = urllib2.urlopen(request).read()
1878                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1879                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1880                         return
1881
1882                 # Extract uploader and title from webpage
1883                 self.report_extraction(video_id)
1884                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1885                 if mobj is None:
1886                         self._downloader.trouble(u'ERROR: unable to extract video title')
1887                         return
1888                 video_title = mobj.group(1).decode('utf-8')
1889                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1890
1891                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1892                 if mobj is None:
1893                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1894                         return
1895                 video_uploader = mobj.group(1).decode('utf-8')
1896
1897                 # Extract video thumbnail
1898                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1899                 if mobj is None:
1900                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1901                         return
1902                 video_thumbnail = mobj.group(1).decode('utf-8')
1903
1904                 # Extract video description
1905                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1906                 if mobj is None:
1907                         self._downloader.trouble(u'ERROR: unable to extract video description')
1908                         return
1909                 video_description = mobj.group(1).decode('utf-8')
1910                 if not video_description:
1911                         video_description = 'No description available.'
1912
1913                 # Extract video height and width
1914                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1915                 if mobj is None:
1916                         self._downloader.trouble(u'ERROR: unable to extract video height')
1917                         return
1918                 yv_video_height = mobj.group(1)
1919
1920                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1921                 if mobj is None:
1922                         self._downloader.trouble(u'ERROR: unable to extract video width')
1923                         return
1924                 yv_video_width = mobj.group(1)
1925
1926                 # Retrieve video playlist to extract media URL
1927                 # I'm not completely sure what all these options are, but we
1928                 # seem to need most of them, otherwise the server sends a 401.
1929                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1930                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1931                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1932                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1933                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1934                 try:
1935                         self.report_download_webpage(video_id)
1936                         webpage = urllib2.urlopen(request).read()
1937                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1938                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1939                         return
1940
1941                 # Extract media URL from playlist XML
1942                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1943                 if mobj is None:
1944                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1945                         return
1946                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1947                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1948
1949                 try:
1950                         # Process video information
1951                         self._downloader.process_info({
1952                                 'id':           video_id.decode('utf-8'),
1953                                 'url':          video_url,
1954                                 'uploader':     video_uploader,
1955                                 'upload_date':  u'NA',
1956                                 'title':        video_title,
1957                                 'stitle':       simple_title,
1958                                 'ext':          video_extension.decode('utf-8'),
1959                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1960                                 'description':  video_description,
1961                                 'thumbnail':    video_thumbnail,
1962                                 'player_url':   None,
1963                         })
1964                 except UnavailableVideoError:
1965                         self._downloader.trouble(u'\nERROR: unable to download video')
1966
1967
1968 class VimeoIE(InfoExtractor):
1969         """Information extractor for vimeo.com."""
1970
1971         # _VALID_URL matches Vimeo URLs
1972         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1973         IE_NAME = u'vimeo'
1974
1975         def __init__(self, downloader=None):
1976                 InfoExtractor.__init__(self, downloader)
1977
1978         def report_download_webpage(self, video_id):
1979                 """Report webpage download."""
1980                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1981
1982         def report_extraction(self, video_id):
1983                 """Report information extraction."""
1984                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1985
1986         def _real_extract(self, url, new_video=True):
1987                 # Extract ID from URL
1988                 mobj = re.match(self._VALID_URL, url)
1989                 if mobj is None:
1990                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1991                         return
1992
1993                 # At this point we have a new video
1994                 self._downloader.increment_downloads()
1995                 video_id = mobj.group(1)
1996
1997                 # Retrieve video webpage to extract further information
1998                 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1999                 try:
2000                         self.report_download_webpage(video_id)
2001                         webpage = urllib2.urlopen(request).read()
2002                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2003                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2004                         return
2005
2006                 # Now we begin extracting as much information as we can from what we
2007                 # retrieved. First we extract the information common to all extractors,
2008                 # and latter we extract those that are Vimeo specific.
2009                 self.report_extraction(video_id)
2010
2011                 # Extract title
2012                 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2013                 if mobj is None:
2014                         self._downloader.trouble(u'ERROR: unable to extract video title')
2015                         return
2016                 video_title = mobj.group(1).decode('utf-8')
2017                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2018
2019                 # Extract uploader
2020                 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2021                 if mobj is None:
2022                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2023                         return
2024                 video_uploader = mobj.group(1).decode('utf-8')
2025
2026                 # Extract video thumbnail
2027                 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2028                 if mobj is None:
2029                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2030                         return
2031                 video_thumbnail = mobj.group(1).decode('utf-8')
2032
2033                 # # Extract video description
2034                 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2035                 # if mobj is None:
2036                 #       self._downloader.trouble(u'ERROR: unable to extract video description')
2037                 #       return
2038                 # video_description = mobj.group(1).decode('utf-8')
2039                 # if not video_description: video_description = 'No description available.'
2040                 video_description = 'Foo.'
2041
2042                 # Vimeo specific: extract request signature
2043                 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2044                 if mobj is None:
2045                         self._downloader.trouble(u'ERROR: unable to extract request signature')
2046                         return
2047                 sig = mobj.group(1).decode('utf-8')
2048
2049                 # Vimeo specific: extract video quality information
2050                 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2051                 if mobj is None:
2052                         self._downloader.trouble(u'ERROR: unable to extract video quality information')
2053                         return
2054                 quality = mobj.group(1).decode('utf-8')
2055
2056                 if int(quality) == 1:
2057                         quality = 'hd'
2058                 else:
2059                         quality = 'sd'
2060
2061                 # Vimeo specific: Extract request signature expiration
2062                 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2063                 if mobj is None:
2064                         self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2065                         return
2066                 sig_exp = mobj.group(1).decode('utf-8')
2067
2068                 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2069
2070                 try:
2071                         # Process video information
2072                         self._downloader.process_info({
2073                                 'id':           video_id.decode('utf-8'),
2074                                 'url':          video_url,
2075                                 'uploader':     video_uploader,
2076                                 'upload_date':  u'NA',
2077                                 'title':        video_title,
2078                                 'stitle':       simple_title,
2079                                 'ext':          u'mp4',
2080                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2081                                 'description':  video_description,
2082                                 'thumbnail':    video_thumbnail,
2083                                 'description':  video_description,
2084                                 'player_url':   None,
2085                         })
2086                 except UnavailableVideoError:
2087                         self._downloader.trouble(u'ERROR: unable to download video')
2088
2089
2090 class GenericIE(InfoExtractor):
2091         """Generic last-resort information extractor."""
2092
2093         _VALID_URL = r'.*'
2094         IE_NAME = u'generic'
2095
2096         def __init__(self, downloader=None):
2097                 InfoExtractor.__init__(self, downloader)
2098
2099         def report_download_webpage(self, video_id):
2100                 """Report webpage download."""
2101                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2102                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2103
2104         def report_extraction(self, video_id):
2105                 """Report information extraction."""
2106                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2107
2108         def _real_extract(self, url):
2109                 # At this point we have a new video
2110                 self._downloader.increment_downloads()
2111
2112                 video_id = url.split('/')[-1]
2113                 request = urllib2.Request(url)
2114                 try:
2115                         self.report_download_webpage(video_id)
2116                         webpage = urllib2.urlopen(request).read()
2117                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2118                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2119                         return
2120                 except ValueError, err:
2121                         # since this is the last-resort InfoExtractor, if
2122                         # this error is thrown, it'll be thrown here
2123                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2124                         return
2125
2126                 self.report_extraction(video_id)
2127                 # Start with something easy: JW Player in SWFObject
2128                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2129                 if mobj is None:
2130                         # Broaden the search a little bit
2131                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2132                 if mobj is None:
2133                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2134                         return
2135
2136                 # It's possible that one of the regexes
2137                 # matched, but returned an empty group:
2138                 if mobj.group(1) is None:
2139                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2140                         return
2141
2142                 video_url = urllib.unquote(mobj.group(1))
2143                 video_id = os.path.basename(video_url)
2144
2145                 # here's a fun little line of code for you:
2146                 video_extension = os.path.splitext(video_id)[1][1:]
2147                 video_id = os.path.splitext(video_id)[0]
2148
2149                 # it's tempting to parse this further, but you would
2150                 # have to take into account all the variations like
2151                 #   Video Title - Site Name
2152                 #   Site Name | Video Title
2153                 #   Video Title - Tagline | Site Name
2154                 # and so on and so forth; it's just not practical
2155                 mobj = re.search(r'<title>(.*)</title>', webpage)
2156                 if mobj is None:
2157                         self._downloader.trouble(u'ERROR: unable to extract title')
2158                         return
2159                 video_title = mobj.group(1).decode('utf-8')
2160                 video_title = sanitize_title(video_title)
2161                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2162
2163                 # video uploader is domain name
2164                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2165                 if mobj is None:
2166                         self._downloader.trouble(u'ERROR: unable to extract title')
2167                         return
2168                 video_uploader = mobj.group(1).decode('utf-8')
2169
2170                 try:
2171                         # Process video information
2172                         self._downloader.process_info({
2173                                 'id':           video_id.decode('utf-8'),
2174                                 'url':          video_url.decode('utf-8'),
2175                                 'uploader':     video_uploader,
2176                                 'upload_date':  u'NA',
2177                                 'title':        video_title,
2178                                 'stitle':       simple_title,
2179                                 'ext':          video_extension.decode('utf-8'),
2180                                 'format':       u'NA',
2181                                 'player_url':   None,
2182                         })
2183                 except UnavailableVideoError, err:
2184                         self._downloader.trouble(u'\nERROR: unable to download video')
2185
2186
2187 class YoutubeSearchIE(InfoExtractor):
2188         """Information Extractor for YouTube search queries."""
2189         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2190         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2191         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2192         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2193         _youtube_ie = None
2194         _max_youtube_results = 1000
2195         IE_NAME = u'youtube:search'
2196
2197         def __init__(self, youtube_ie, downloader=None):
2198                 InfoExtractor.__init__(self, downloader)
2199                 self._youtube_ie = youtube_ie
2200
2201         def report_download_page(self, query, pagenum):
2202                 """Report attempt to download playlist page with given number."""
2203                 query = query.decode(preferredencoding())
2204                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2205
2206         def _real_initialize(self):
2207                 self._youtube_ie.initialize()
2208
2209         def _real_extract(self, query):
2210                 mobj = re.match(self._VALID_URL, query)
2211                 if mobj is None:
2212                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2213                         return
2214
2215                 prefix, query = query.split(':')
2216                 prefix = prefix[8:]
2217                 query = query.encode('utf-8')
2218                 if prefix == '':
2219                         self._download_n_results(query, 1)
2220                         return
2221                 elif prefix == 'all':
2222                         self._download_n_results(query, self._max_youtube_results)
2223                         return
2224                 else:
2225                         try:
2226                                 n = long(prefix)
2227                                 if n <= 0:
2228                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2229                                         return
2230                                 elif n > self._max_youtube_results:
2231                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2232                                         n = self._max_youtube_results
2233                                 self._download_n_results(query, n)
2234                                 return
2235                         except ValueError: # parsing prefix as integer fails
2236                                 self._download_n_results(query, 1)
2237                                 return
2238
2239         def _download_n_results(self, query, n):
2240                 """Downloads a specified number of results for a query"""
2241
2242                 video_ids = []
2243                 already_seen = set()
2244                 pagenum = 1
2245
2246                 while True:
2247                         self.report_download_page(query, pagenum)
2248                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2249                         request = urllib2.Request(result_url)
2250                         try:
2251                                 page = urllib2.urlopen(request).read()
2252                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2253                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2254                                 return
2255
2256                         # Extract video identifiers
2257                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2258                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2259                                 if video_id not in already_seen:
2260                                         video_ids.append(video_id)
2261                                         already_seen.add(video_id)
2262                                         if len(video_ids) == n:
2263                                                 # Specified n videos reached
2264                                                 for id in video_ids:
2265                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2266                                                 return
2267
2268                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2269                                 for id in video_ids:
2270                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2271                                 return
2272
2273                         pagenum = pagenum + 1
2274
2275
2276 class GoogleSearchIE(InfoExtractor):
2277         """Information Extractor for Google Video search queries."""
2278         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2279         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2280         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2281         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2282         _google_ie = None
2283         _max_google_results = 1000
2284         IE_NAME = u'video.google:search'
2285
2286         def __init__(self, google_ie, downloader=None):
2287                 InfoExtractor.__init__(self, downloader)
2288                 self._google_ie = google_ie
2289
2290         def report_download_page(self, query, pagenum):
2291                 """Report attempt to download playlist page with given number."""
2292                 query = query.decode(preferredencoding())
2293                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2294
2295         def _real_initialize(self):
2296                 self._google_ie.initialize()
2297
2298         def _real_extract(self, query):
2299                 mobj = re.match(self._VALID_URL, query)
2300                 if mobj is None:
2301                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2302                         return
2303
2304                 prefix, query = query.split(':')
2305                 prefix = prefix[8:]
2306                 query = query.encode('utf-8')
2307                 if prefix == '':
2308                         self._download_n_results(query, 1)
2309                         return
2310                 elif prefix == 'all':
2311                         self._download_n_results(query, self._max_google_results)
2312                         return
2313                 else:
2314                         try:
2315                                 n = long(prefix)
2316                                 if n <= 0:
2317                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2318                                         return
2319                                 elif n > self._max_google_results:
2320                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2321                                         n = self._max_google_results
2322                                 self._download_n_results(query, n)
2323                                 return
2324                         except ValueError: # parsing prefix as integer fails
2325                                 self._download_n_results(query, 1)
2326                                 return
2327
2328         def _download_n_results(self, query, n):
2329                 """Downloads a specified number of results for a query"""
2330
2331                 video_ids = []
2332                 already_seen = set()
2333                 pagenum = 1
2334
2335                 while True:
2336                         self.report_download_page(query, pagenum)
2337                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2338                         request = urllib2.Request(result_url)
2339                         try:
2340                                 page = urllib2.urlopen(request).read()
2341                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2342                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2343                                 return
2344
2345                         # Extract video identifiers
2346                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2347                                 video_id = mobj.group(1)
2348                                 if video_id not in already_seen:
2349                                         video_ids.append(video_id)
2350                                         already_seen.add(video_id)
2351                                         if len(video_ids) == n:
2352                                                 # Specified n videos reached
2353                                                 for id in video_ids:
2354                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2355                                                 return
2356
2357                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2358                                 for id in video_ids:
2359                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2360                                 return
2361
2362                         pagenum = pagenum + 1
2363
2364
2365 class YahooSearchIE(InfoExtractor):
2366         """Information Extractor for Yahoo! Video search queries."""
2367         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2368         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2369         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2370         _MORE_PAGES_INDICATOR = r'\s*Next'
2371         _yahoo_ie = None
2372         _max_yahoo_results = 1000
2373         IE_NAME = u'video.yahoo:search'
2374
2375         def __init__(self, yahoo_ie, downloader=None):
2376                 InfoExtractor.__init__(self, downloader)
2377                 self._yahoo_ie = yahoo_ie
2378
2379         def report_download_page(self, query, pagenum):
2380                 """Report attempt to download playlist page with given number."""
2381                 query = query.decode(preferredencoding())
2382                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2383
2384         def _real_initialize(self):
2385                 self._yahoo_ie.initialize()
2386
2387         def _real_extract(self, query):
2388                 mobj = re.match(self._VALID_URL, query)
2389                 if mobj is None:
2390                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2391                         return
2392
2393                 prefix, query = query.split(':')
2394                 prefix = prefix[8:]
2395                 query = query.encode('utf-8')
2396                 if prefix == '':
2397                         self._download_n_results(query, 1)
2398                         return
2399                 elif prefix == 'all':
2400                         self._download_n_results(query, self._max_yahoo_results)
2401                         return
2402                 else:
2403                         try:
2404                                 n = long(prefix)
2405                                 if n <= 0:
2406                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2407                                         return
2408                                 elif n > self._max_yahoo_results:
2409                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2410                                         n = self._max_yahoo_results
2411                                 self._download_n_results(query, n)
2412                                 return
2413                         except ValueError: # parsing prefix as integer fails
2414                                 self._download_n_results(query, 1)
2415                                 return
2416
2417         def _download_n_results(self, query, n):
2418                 """Downloads a specified number of results for a query"""
2419
2420                 video_ids = []
2421                 already_seen = set()
2422                 pagenum = 1
2423
2424                 while True:
2425                         self.report_download_page(query, pagenum)
2426                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2427                         request = urllib2.Request(result_url)
2428                         try:
2429                                 page = urllib2.urlopen(request).read()
2430                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2431                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2432                                 return
2433
2434                         # Extract video identifiers
2435                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2436                                 video_id = mobj.group(1)
2437                                 if video_id not in already_seen:
2438                                         video_ids.append(video_id)
2439                                         already_seen.add(video_id)
2440                                         if len(video_ids) == n:
2441                                                 # Specified n videos reached
2442                                                 for id in video_ids:
2443                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2444                                                 return
2445
2446                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2447                                 for id in video_ids:
2448                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2449                                 return
2450
2451                         pagenum = pagenum + 1
2452
2453
2454 class YoutubePlaylistIE(InfoExtractor):
2455         """Information Extractor for YouTube playlists."""
2456
2457         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2458         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2459         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2460         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2461         _youtube_ie = None
2462         IE_NAME = u'youtube:playlist'
2463
2464         def __init__(self, youtube_ie, downloader=None):
2465                 InfoExtractor.__init__(self, downloader)
2466                 self._youtube_ie = youtube_ie
2467
2468         def report_download_page(self, playlist_id, pagenum):
2469                 """Report attempt to download playlist page with given number."""
2470                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2471
2472         def _real_initialize(self):
2473                 self._youtube_ie.initialize()
2474
2475         def _real_extract(self, url):
2476                 # Extract playlist id
2477                 mobj = re.match(self._VALID_URL, url)
2478                 if mobj is None:
2479                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2480                         return
2481
2482                 # Single video case
2483                 if mobj.group(3) is not None:
2484                         self._youtube_ie.extract(mobj.group(3))
2485                         return
2486
2487                 # Download playlist pages
2488                 # prefix is 'p' as default for playlists but there are other types that need extra care
2489                 playlist_prefix = mobj.group(1)
2490                 if playlist_prefix == 'a':
2491                         playlist_access = 'artist'
2492                 else:
2493                         playlist_prefix = 'p'
2494                         playlist_access = 'view_play_list'
2495                 playlist_id = mobj.group(2)
2496                 video_ids = []
2497                 pagenum = 1
2498
2499                 while True:
2500                         self.report_download_page(playlist_id, pagenum)
2501                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2502                         request = urllib2.Request(url)
2503                         try:
2504                                 page = urllib2.urlopen(request).read()
2505                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2506                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2507                                 return
2508
2509                         # Extract video identifiers
2510                         ids_in_page = []
2511                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2512                                 if mobj.group(1) not in ids_in_page:
2513                                         ids_in_page.append(mobj.group(1))
2514                         video_ids.extend(ids_in_page)
2515
2516                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2517                                 break
2518                         pagenum = pagenum + 1
2519
2520                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2521                 playlistend = self._downloader.params.get('playlistend', -1)
2522                 video_ids = video_ids[playliststart:playlistend]
2523
2524                 for id in video_ids:
2525                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2526                 return
2527
2528
2529 class YoutubeUserIE(InfoExtractor):
2530         """Information Extractor for YouTube users."""
2531
2532         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2533         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2534         _GDATA_PAGE_SIZE = 50
2535         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2536         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2537         _youtube_ie = None
2538         IE_NAME = u'youtube:user'
2539
2540         def __init__(self, youtube_ie, downloader=None):
2541                 InfoExtractor.__init__(self, downloader)
2542                 self._youtube_ie = youtube_ie
2543
2544         def report_download_page(self, username, start_index):
2545                 """Report attempt to download user page."""
2546                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2547                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2548
2549         def _real_initialize(self):
2550                 self._youtube_ie.initialize()
2551
2552         def _real_extract(self, url):
2553                 # Extract username
2554                 mobj = re.match(self._VALID_URL, url)
2555                 if mobj is None:
2556                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2557                         return
2558
2559                 username = mobj.group(1)
2560
2561                 # Download video ids using YouTube Data API. Result size per
2562                 # query is limited (currently to 50 videos) so we need to query
2563                 # page by page until there are no video ids - it means we got
2564                 # all of them.
2565
2566                 video_ids = []
2567                 pagenum = 0
2568
2569                 while True:
2570                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2571                         self.report_download_page(username, start_index)
2572
2573                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2574
2575                         try:
2576                                 page = urllib2.urlopen(request).read()
2577                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2578                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2579                                 return
2580
2581                         # Extract video identifiers
2582                         ids_in_page = []
2583
2584                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2585                                 if mobj.group(1) not in ids_in_page:
2586                                         ids_in_page.append(mobj.group(1))
2587
2588                         video_ids.extend(ids_in_page)
2589
2590                         # A little optimization - if current page is not
2591                         # "full", ie. does not contain PAGE_SIZE video ids then
2592                         # we can assume that this page is the last one - there
2593                         # are no more ids on further pages - no need to query
2594                         # again.
2595
2596                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2597                                 break
2598
2599                         pagenum += 1
2600
2601                 all_ids_count = len(video_ids)
2602                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2603                 playlistend = self._downloader.params.get('playlistend', -1)
2604
2605                 if playlistend == -1:
2606                         video_ids = video_ids[playliststart:]
2607                 else:
2608                         video_ids = video_ids[playliststart:playlistend]
2609
2610                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2611                                 (username, all_ids_count, len(video_ids)))
2612
2613                 for video_id in video_ids:
2614                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2615
2616
2617 class DepositFilesIE(InfoExtractor):
2618         """Information extractor for depositfiles.com"""
2619
2620         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2621         IE_NAME = u'DepositFiles'
2622
2623         def __init__(self, downloader=None):
2624                 InfoExtractor.__init__(self, downloader)
2625
2626         def report_download_webpage(self, file_id):
2627                 """Report webpage download."""
2628                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2629
2630         def report_extraction(self, file_id):
2631                 """Report information extraction."""
2632                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2633
2634         def _real_extract(self, url):
2635                 # At this point we have a new file
2636                 self._downloader.increment_downloads()
2637
2638                 file_id = url.split('/')[-1]
2639                 # Rebuild url in english locale
2640                 url = 'http://depositfiles.com/en/files/' + file_id
2641
2642                 # Retrieve file webpage with 'Free download' button pressed
2643                 free_download_indication = { 'gateway_result' : '1' }
2644                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2645                 try:
2646                         self.report_download_webpage(file_id)
2647                         webpage = urllib2.urlopen(request).read()
2648                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2649                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2650                         return
2651
2652                 # Search for the real file URL
2653                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2654                 if (mobj is None) or (mobj.group(1) is None):
2655                         # Try to figure out reason of the error.
2656                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2657                         if (mobj is not None) and (mobj.group(1) is not None):
2658                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2659                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2660                         else:
2661                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2662                         return
2663
2664                 file_url = mobj.group(1)
2665                 file_extension = os.path.splitext(file_url)[1][1:]
2666
2667                 # Search for file title
2668                 mobj = re.search(r'<b title="(.*?)">', webpage)
2669                 if mobj is None:
2670                         self._downloader.trouble(u'ERROR: unable to extract title')
2671                         return
2672                 file_title = mobj.group(1).decode('utf-8')
2673
2674                 try:
2675                         # Process file information
2676                         self._downloader.process_info({
2677                                 'id':           file_id.decode('utf-8'),
2678                                 'url':          file_url.decode('utf-8'),
2679                                 'uploader':     u'NA',
2680                                 'upload_date':  u'NA',
2681                                 'title':        file_title,
2682                                 'stitle':       file_title,
2683                                 'ext':          file_extension.decode('utf-8'),
2684                                 'format':       u'NA',
2685                                 'player_url':   None,
2686                         })
2687                 except UnavailableVideoError, err:
2688                         self._downloader.trouble(u'ERROR: unable to download file')
2689
2690
2691 class FacebookIE(InfoExtractor):
2692         """Information Extractor for Facebook"""
2693
2694         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2695         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2696         _NETRC_MACHINE = 'facebook'
2697         _available_formats = ['video', 'highqual', 'lowqual']
2698         _video_extensions = {
2699                 'video': 'mp4',
2700                 'highqual': 'mp4',
2701                 'lowqual': 'mp4',
2702         }
2703         IE_NAME = u'facebook'
2704
2705         def __init__(self, downloader=None):
2706                 InfoExtractor.__init__(self, downloader)
2707
2708         def _reporter(self, message):
2709                 """Add header and report message."""
2710                 self._downloader.to_screen(u'[facebook] %s' % message)
2711
2712         def report_login(self):
2713                 """Report attempt to log in."""
2714                 self._reporter(u'Logging in')
2715
2716         def report_video_webpage_download(self, video_id):
2717                 """Report attempt to download video webpage."""
2718                 self._reporter(u'%s: Downloading video webpage' % video_id)
2719
2720         def report_information_extraction(self, video_id):
2721                 """Report attempt to extract video information."""
2722                 self._reporter(u'%s: Extracting video information' % video_id)
2723
2724         def _parse_page(self, video_webpage):
2725                 """Extract video information from page"""
2726                 # General data
2727                 data = {'title': r'\("video_title", "(.*?)"\)',
2728                         'description': r'<div class="datawrap">(.*?)</div>',
2729                         'owner': r'\("video_owner_name", "(.*?)"\)',
2730                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2731                         }
2732                 video_info = {}
2733                 for piece in data.keys():
2734                         mobj = re.search(data[piece], video_webpage)
2735                         if mobj is not None:
2736                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2737
2738                 # Video urls
2739                 video_urls = {}
2740                 for fmt in self._available_formats:
2741                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2742                         if mobj is not None:
2743                                 # URL is in a Javascript segment inside an escaped Unicode format within
2744                                 # the generally utf-8 page
2745                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2746                 video_info['video_urls'] = video_urls
2747
2748                 return video_info
2749
2750         def _real_initialize(self):
2751                 if self._downloader is None:
2752                         return
2753
2754                 useremail = None
2755                 password = None
2756                 downloader_params = self._downloader.params
2757
2758                 # Attempt to use provided username and password or .netrc data
2759                 if downloader_params.get('username', None) is not None:
2760                         useremail = downloader_params['username']
2761                         password = downloader_params['password']
2762                 elif downloader_params.get('usenetrc', False):
2763                         try:
2764                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2765                                 if info is not None:
2766                                         useremail = info[0]
2767                                         password = info[2]
2768                                 else:
2769                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2770                         except (IOError, netrc.NetrcParseError), err:
2771                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2772                                 return
2773
2774                 if useremail is None:
2775                         return
2776
2777                 # Log in
2778                 login_form = {
2779                         'email': useremail,
2780                         'pass': password,
2781                         'login': 'Log+In'
2782                         }
2783                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2784                 try:
2785                         self.report_login()
2786                         login_results = urllib2.urlopen(request).read()
2787                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2788                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2789                                 return
2790                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2791                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2792                         return
2793
2794         def _real_extract(self, url):
2795                 mobj = re.match(self._VALID_URL, url)
2796                 if mobj is None:
2797                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2798                         return
2799                 video_id = mobj.group('ID')
2800
2801                 # Get video webpage
2802                 self.report_video_webpage_download(video_id)
2803                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2804                 try:
2805                         page = urllib2.urlopen(request)
2806                         video_webpage = page.read()
2807                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2808                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2809                         return
2810
2811                 # Start extracting information
2812                 self.report_information_extraction(video_id)
2813
2814                 # Extract information
2815                 video_info = self._parse_page(video_webpage)
2816
2817                 # uploader
2818                 if 'owner' not in video_info:
2819                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2820                         return
2821                 video_uploader = video_info['owner']
2822
2823                 # title
2824                 if 'title' not in video_info:
2825                         self._downloader.trouble(u'ERROR: unable to extract video title')
2826                         return
2827                 video_title = video_info['title']
2828                 video_title = video_title.decode('utf-8')
2829                 video_title = sanitize_title(video_title)
2830
2831                 # simplified title
2832                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2833                 simple_title = simple_title.strip(ur'_')
2834
2835                 # thumbnail image
2836                 if 'thumbnail' not in video_info:
2837                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2838                         video_thumbnail = ''
2839                 else:
2840                         video_thumbnail = video_info['thumbnail']
2841
2842                 # upload date
2843                 upload_date = u'NA'
2844                 if 'upload_date' in video_info:
2845                         upload_time = video_info['upload_date']
2846                         timetuple = email.utils.parsedate_tz(upload_time)
2847                         if timetuple is not None:
2848                                 try:
2849                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2850                                 except:
2851                                         pass
2852
2853                 # description
2854                 video_description = video_info.get('description', 'No description available.')
2855
2856                 url_map = video_info['video_urls']
2857                 if len(url_map.keys()) > 0:
2858                         # Decide which formats to download
2859                         req_format = self._downloader.params.get('format', None)
2860                         format_limit = self._downloader.params.get('format_limit', None)
2861
2862                         if format_limit is not None and format_limit in self._available_formats:
2863                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2864                         else:
2865                                 format_list = self._available_formats
2866                         existing_formats = [x for x in format_list if x in url_map]
2867                         if len(existing_formats) == 0:
2868                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2869                                 return
2870                         if req_format is None:
2871                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2872                         elif req_format == 'worst':
2873                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2874                         elif req_format == '-1':
2875                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2876                         else:
2877                                 # Specific format
2878                                 if req_format not in url_map:
2879                                         self._downloader.trouble(u'ERROR: requested format not available')
2880                                         return
2881                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2882
2883                 for format_param, video_real_url in video_url_list:
2884
2885                         # At this point we have a new video
2886                         self._downloader.increment_downloads()
2887
2888                         # Extension
2889                         video_extension = self._video_extensions.get(format_param, 'mp4')
2890
2891                         try:
2892                                 # Process video information
2893                                 self._downloader.process_info({
2894                                         'id':           video_id.decode('utf-8'),
2895                                         'url':          video_real_url.decode('utf-8'),
2896                                         'uploader':     video_uploader.decode('utf-8'),
2897                                         'upload_date':  upload_date,
2898                                         'title':        video_title,
2899                                         'stitle':       simple_title,
2900                                         'ext':          video_extension.decode('utf-8'),
2901                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2902                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2903                                         'description':  video_description.decode('utf-8'),
2904                                         'player_url':   None,
2905                                 })
2906                         except UnavailableVideoError, err:
2907                                 self._downloader.trouble(u'\nERROR: unable to download video')
2908
2909 class BlipTVIE(InfoExtractor):
2910         """Information extractor for blip.tv"""
2911
2912         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2913         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2914         IE_NAME = u'blip.tv'
2915
2916         def report_extraction(self, file_id):
2917                 """Report information extraction."""
2918                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2919
2920         def report_direct_download(self, title):
2921                 """Report information extraction."""
2922                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2923
2924         def _simplify_title(self, title):
2925                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2926                 res = res.strip(ur'_')
2927                 return res
2928
2929         def _real_extract(self, url):
2930                 mobj = re.match(self._VALID_URL, url)
2931                 if mobj is None:
2932                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2933                         return
2934
2935                 if '?' in url:
2936                         cchar = '&'
2937                 else:
2938                         cchar = '?'
2939                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2940                 request = urllib2.Request(json_url)
2941                 self.report_extraction(mobj.group(1))
2942                 info = None
2943                 try:
2944                         urlh = urllib2.urlopen(request)
2945                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2946                                 basename = url.split('/')[-1]
2947                                 title,ext = os.path.splitext(basename)
2948                                 ext = ext.replace('.', '')
2949                                 self.report_direct_download(title)
2950                                 info = {
2951                                         'id': title,
2952                                         'url': url,
2953                                         'title': title,
2954                                         'stitle': self._simplify_title(title),
2955                                         'ext': ext,
2956                                         'urlhandle': urlh
2957                                 }
2958                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2959                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2960                         return
2961                 if info is None: # Regular URL
2962                         try:
2963                                 json_code = urlh.read()
2964                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2965                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2966                                 return
2967
2968                         try:
2969                                 json_data = json.loads(json_code)
2970                                 if 'Post' in json_data:
2971                                         data = json_data['Post']
2972                                 else:
2973                                         data = json_data
2974         
2975                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2976                                 video_url = data['media']['url']
2977                                 umobj = re.match(self._URL_EXT, video_url)
2978                                 if umobj is None:
2979                                         raise ValueError('Can not determine filename extension')
2980                                 ext = umobj.group(1)
2981         
2982                                 info = {
2983                                         'id': data['item_id'],
2984                                         'url': video_url,
2985                                         'uploader': data['display_name'],
2986                                         'upload_date': upload_date,
2987                                         'title': data['title'],
2988                                         'stitle': self._simplify_title(data['title']),
2989                                         'ext': ext,
2990                                         'format': data['media']['mimeType'],
2991                                         'thumbnail': data['thumbnailUrl'],
2992                                         'description': data['description'],
2993                                         'player_url': data['embedUrl']
2994                                 }
2995                         except (ValueError,KeyError), err:
2996                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2997                                 return
2998
2999                 self._downloader.increment_downloads()
3000
3001                 try:
3002                         self._downloader.process_info(info)
3003                 except UnavailableVideoError, err:
3004                         self._downloader.trouble(u'\nERROR: unable to download video')
3005
3006
3007 class MyVideoIE(InfoExtractor):
3008         """Information Extractor for myvideo.de."""
3009
3010         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3011         IE_NAME = u'myvideo'
3012
3013         def __init__(self, downloader=None):
3014                 InfoExtractor.__init__(self, downloader)
3015         
3016         def report_download_webpage(self, video_id):
3017                 """Report webpage download."""
3018                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3019
3020         def report_extraction(self, video_id):
3021                 """Report information extraction."""
3022                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3023
3024         def _real_extract(self,url):
3025                 mobj = re.match(self._VALID_URL, url)
3026                 if mobj is None:
3027                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3028                         return
3029
3030                 video_id = mobj.group(1)
3031                 simple_title = mobj.group(2).decode('utf-8')
3032                 # should actually not be necessary
3033                 simple_title = sanitize_title(simple_title)
3034                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3035
3036                 # Get video webpage
3037                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3038                 try:
3039                         self.report_download_webpage(video_id)
3040                         webpage = urllib2.urlopen(request).read()
3041                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3042                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3043                         return
3044
3045                 self.report_extraction(video_id)
3046                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3047                                  webpage)
3048                 if mobj is None:
3049                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3050                         return
3051                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3052
3053                 mobj = re.search('<title>([^<]+)</title>', webpage)
3054                 if mobj is None:
3055                         self._downloader.trouble(u'ERROR: unable to extract title')
3056                         return
3057
3058                 video_title = mobj.group(1)
3059                 video_title = sanitize_title(video_title)
3060
3061                 try:
3062                         self._downloader.process_info({
3063                                 'id':           video_id,
3064                                 'url':          video_url,
3065                                 'uploader':     u'NA',
3066                                 'upload_date':  u'NA',
3067                                 'title':        video_title,
3068                                 'stitle':       simple_title,
3069                                 'ext':          u'flv',
3070                                 'format':       u'NA',
3071                                 'player_url':   None,
3072                         })
3073                 except UnavailableVideoError:
3074                         self._downloader.trouble(u'\nERROR: Unable to download video')
3075
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a ":shortname" alias (e.g. ":tds", ":colbert") or a
	# full-episodes URL on thedailyshow.com / colbertnation.com.  The
	# "episode" group may be empty, which means "newest episode".
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report mediaGen configuration download."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report MRSS show index download."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report player URL determination."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _simplify_title(self, title):
		# Collapse each run of characters outside simple_title_chars into a
		# single underscore, then strip leading/trailing underscores.
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')
		return res

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Map ":shortname" aliases onto the show's full-episodes page and
		# re-match so the named groups below are populated.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = 'http://www.thedailyshow.com/full-episodes/'
			else:
				url = 'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# An empty "episode" group means: let the site redirect us to the
		# newest episode and pick the slug up from the final URL.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# geturl() reflects any redirect; it must now carry a concrete
			# episode slug, otherwise we cannot identify what to download.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash <param name="movie"> embed carries both the player URL
		# (group 0) and the mtvnservices URI identifying the episode (group 1).
		mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Resolve the player URL through its redirect chain; the final URL is
		# passed along as player_url for rtmpdump-style downloads.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		# Fetch the MRSS index that lists the episode's video segments.
		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# The guid is colon-separated; the last field is the short media
			# id and the one before it names the show (e.g. "thedailyshow.com").
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# Per-item mediaGen config lists the available renditions.
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + '-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': self._simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3215
3216
3217 class EscapistIE(InfoExtractor):
3218         """Information extractor for The Escapist """
3219
3220         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3221         IE_NAME = u'escapist'
3222
3223         def report_extraction(self, showName):
3224                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3225
3226         def report_config_download(self, showName):
3227                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3228
3229         def _simplify_title(self, title):
3230                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3231                 res = res.strip(ur'_')
3232                 return res
3233
3234         def _real_extract(self, url):
3235                 htmlParser = HTMLParser.HTMLParser()
3236
3237                 mobj = re.match(self._VALID_URL, url)
3238                 if mobj is None:
3239                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3240                         return
3241                 showName = mobj.group('showname')
3242                 videoId = mobj.group('episode')
3243
3244                 self.report_extraction(showName)
3245                 try:
3246                         webPage = urllib2.urlopen(url).read()
3247                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3248                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3249                         return
3250
3251                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3252                 description = htmlParser.unescape(descMatch.group(1))
3253                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3254                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3255                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3256                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3257                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3258                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3259
3260                 self.report_config_download(showName)
3261                 try:
3262                         configJSON = urllib2.urlopen(configUrl).read()
3263                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3264                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3265                         return
3266
3267                 # Technically, it's JavaScript, not JSON
3268                 configJSON = configJSON.replace("'", '"')
3269
3270                 try:
3271                         config = json.loads(configJSON)
3272                 except (ValueError,), err:
3273                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3274                         return
3275
3276                 playlist = config['playlist']
3277                 videoUrl = playlist[1]['url']
3278
3279                 self._downloader.increment_downloads()
3280                 info = {
3281                         'id': videoId,
3282                         'url': videoUrl,
3283                         'uploader': showName,
3284                         'upload_date': None,
3285                         'title': showName,
3286                         'stitle': self._simplify_title(showName),
3287                         'ext': 'flv',
3288                         'format': 'flv',
3289                         'thumbnail': imgUrl,
3290                         'description': description,
3291                         'player_url': playerUrl,
3292                 }
3293
3294                 try:
3295                         self._downloader.process_info(info)
3296                 except UnavailableVideoError, err:
3297                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3298
3299
3300 class CollegeHumorIE(InfoExtractor):
3301         """Information extractor for collegehumor.com"""
3302
3303         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3304         IE_NAME = u'collegehumor'
3305
3306         def report_webpage(self, video_id):
3307                 """Report information extraction."""
3308                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3309
3310         def report_extraction(self, video_id):
3311                 """Report information extraction."""
3312                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3313
3314         def _simplify_title(self, title):
3315                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3316                 res = res.strip(ur'_')
3317                 return res
3318
3319         def _real_extract(self, url):
3320                 htmlParser = HTMLParser.HTMLParser()
3321
3322                 mobj = re.match(self._VALID_URL, url)
3323                 if mobj is None:
3324                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3325                         return
3326                 video_id = mobj.group('videoid')
3327
3328                 self.report_webpage(video_id)
3329                 request = urllib2.Request(url)
3330                 try:
3331                         webpage = urllib2.urlopen(request).read()
3332                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3333                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3334                         return
3335
3336                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3337                 if m is None:
3338                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3339                         return
3340                 internal_video_id = m.group('internalvideoid')
3341
3342                 info = {
3343                         'id': video_id,
3344                         'internal_id': internal_video_id,
3345                 }
3346
3347                 self.report_extraction(video_id)
3348                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3349                 try:
3350                         metaXml = urllib2.urlopen(xmlUrl).read()
3351                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3352                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3353                         return
3354
3355                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3356                 try:
3357                         videoNode = mdoc.findall('./video')[0]
3358                         info['description'] = videoNode.findall('./description')[0].text
3359                         info['title'] = videoNode.findall('./caption')[0].text
3360                         info['stitle'] = self._simplify_title(info['title'])
3361                         info['url'] = videoNode.findall('./file')[0].text
3362                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3363                         info['ext'] = info['url'].rpartition('.')[2]
3364                         info['format'] = info['ext']
3365                 except IndexError:
3366                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3367                         return
3368
3369                 self._downloader.increment_downloads()
3370
3371                 try:
3372                         self._downloader.process_info(info)
3373                 except UnavailableVideoError, err:
3374                         self._downloader.trouble(u'\nERROR: unable to download video')
3375
3376
3377 class XVideosIE(InfoExtractor):
3378         """Information extractor for xvideos.com"""
3379
3380         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3381         IE_NAME = u'xvideos'
3382
3383         def report_webpage(self, video_id):
3384                 """Report information extraction."""
3385                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3386
3387         def report_extraction(self, video_id):
3388                 """Report information extraction."""
3389                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3390
3391         def _simplify_title(self, title):
3392                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3393                 res = res.strip(ur'_')
3394                 return res
3395
3396         def _real_extract(self, url):
3397                 htmlParser = HTMLParser.HTMLParser()
3398
3399                 mobj = re.match(self._VALID_URL, url)
3400                 if mobj is None:
3401                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3402                         return
3403                 video_id = mobj.group(1).decode('utf-8')
3404
3405                 self.report_webpage(video_id)
3406
3407                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3408                 try:
3409                         webpage = urllib2.urlopen(request).read()
3410                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3411                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3412                         return
3413
3414                 self.report_extraction(video_id)
3415
3416
3417                 # Extract video URL
3418                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3419                 if mobj is None:
3420                         self._downloader.trouble(u'ERROR: unable to extract video url')
3421                         return
3422                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3423
3424
3425                 # Extract title
3426                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3427                 if mobj is None:
3428                         self._downloader.trouble(u'ERROR: unable to extract video title')
3429                         return
3430                 video_title = mobj.group(1).decode('utf-8')
3431
3432
3433                 # Extract video thumbnail
3434                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3435                 if mobj is None:
3436                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3437                         return
3438                 video_thumbnail = mobj.group(1).decode('utf-8')
3439
3440
3441
3442                 self._downloader.increment_downloads()
3443                 info = {
3444                         'id': video_id,
3445                         'url': video_url,
3446                         'uploader': None,
3447                         'upload_date': None,
3448                         'title': video_title,
3449                         'stitle': self._simplify_title(video_title),
3450                         'ext': 'flv',
3451                         'format': 'flv',
3452                         'thumbnail': video_thumbnail,
3453                         'description': None,
3454                         'player_url': None,
3455                 }
3456
3457                 try:
3458                         self._downloader.process_info(info)
3459                 except UnavailableVideoError, err:
3460                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3461
3462
3463 class SoundcloudIE(InfoExtractor):
3464         """Information extractor for soundcloud.com
3465            To access the media, the uid of the song and a stream token
3466            must be extracted from the page source and the script must make
3467            a request to media.soundcloud.com/crossdomain.xml. Then
3468            the media can be grabbed by requesting from an url composed
3469            of the stream token and uid
3470          """
3471
3472         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3473         IE_NAME = u'soundcloud'
3474
3475         def __init__(self, downloader=None):
3476                 InfoExtractor.__init__(self, downloader)
3477
3478         def report_webpage(self, video_id):
3479                 """Report information extraction."""
3480                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3481
3482         def report_extraction(self, video_id):
3483                 """Report information extraction."""
3484                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3485
3486         def _real_extract(self, url):
3487                 htmlParser = HTMLParser.HTMLParser()
3488
3489                 mobj = re.match(self._VALID_URL, url)
3490                 if mobj is None:
3491                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3492                         return
3493
3494                 # extract uploader (which is in the url)
3495                 uploader = mobj.group(1).decode('utf-8')
3496                 # extract simple title (uploader + slug of song title)
3497                 slug_title =  mobj.group(2).decode('utf-8')
3498                 simple_title = uploader + '-' + slug_title
3499
3500                 self.report_webpage('%s/%s' % (uploader, slug_title))
3501
3502                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3503                 try:
3504                         webpage = urllib2.urlopen(request).read()
3505                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3506                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3507                         return
3508
3509                 self.report_extraction('%s/%s' % (uploader, slug_title))
3510
3511                 # extract uid and stream token that soundcloud hands out for access
3512                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3513                 if mobj:
3514                         video_id = mobj.group(1)
3515                         stream_token = mobj.group(2)
3516
3517                 # extract unsimplified title
3518                 mobj = re.search('"title":"(.*?)",', webpage)
3519                 if mobj:
3520                         title = mobj.group(1)
3521
3522                 # construct media url (with uid/token)
3523                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3524                 mediaURL = mediaURL % (video_id, stream_token)
3525
3526                 # description
3527                 description = u'No description available'
3528                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3529                 if mobj:
3530                         description = mobj.group(1)
3531                 
3532                 # upload date
3533                 upload_date = None
3534                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3535                 if mobj:
3536                         try:
3537                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3538                         except Exception as e:
3539                                 print str(e)
3540
3541                 # for soundcloud, a request to a cross domain is required for cookies
3542                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3543
3544                 try:
3545                         self._downloader.process_info({
3546                                 'id':           video_id.decode('utf-8'),
3547                                 'url':          mediaURL,
3548                                 'uploader':     uploader.decode('utf-8'),
3549                                 'upload_date':  upload_date,
3550                                 'title':        simple_title.decode('utf-8'),
3551                                 'stitle':       simple_title.decode('utf-8'),
3552                                 'ext':          u'mp3',
3553                                 'format':       u'NA',
3554                                 'player_url':   None,
3555                                 'description': description.decode('utf-8')
3556                         })
3557                 except UnavailableVideoError:
3558                         self._downloader.trouble(u'\nERROR: unable to download video')
3559
3560
3561 class InfoQIE(InfoExtractor):
3562         """Information extractor for infoq.com"""
3563
3564         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3565         IE_NAME = u'infoq'
3566
3567         def report_webpage(self, video_id):
3568                 """Report information extraction."""
3569                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3570
3571         def report_extraction(self, video_id):
3572                 """Report information extraction."""
3573                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3574
3575         def _simplify_title(self, title):
3576                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3577                 res = res.strip(ur'_')
3578                 return res
3579
3580         def _real_extract(self, url):
3581                 htmlParser = HTMLParser.HTMLParser()
3582
3583                 mobj = re.match(self._VALID_URL, url)
3584                 if mobj is None:
3585                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3586                         return
3587
3588                 self.report_webpage(url)
3589
3590                 request = urllib2.Request(url)
3591                 try:
3592                         webpage = urllib2.urlopen(request).read()
3593                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3594                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3595                         return
3596
3597                 self.report_extraction(url)
3598
3599
3600                 # Extract video URL
3601                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3602                 if mobj is None:
3603                         self._downloader.trouble(u'ERROR: unable to extract video url')
3604                         return
3605                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3606
3607
3608                 # Extract title
3609                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3610                 if mobj is None:
3611                         self._downloader.trouble(u'ERROR: unable to extract video title')
3612                         return
3613                 video_title = mobj.group(1).decode('utf-8')
3614
3615                 # Extract description
3616                 video_description = u'No description available.'
3617                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3618                 if mobj is not None:
3619                         video_description = mobj.group(1).decode('utf-8')
3620
3621                 video_filename = video_url.split('/')[-1]
3622                 video_id, extension = video_filename.split('.')
3623
3624                 self._downloader.increment_downloads()
3625                 info = {
3626                         'id': video_id,
3627                         'url': video_url,
3628                         'uploader': None,
3629                         'upload_date': None,
3630                         'title': video_title,
3631                         'stitle': self._simplify_title(video_title),
3632                         'ext': extension,
3633                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3634                         'thumbnail': None,
3635                         'description': video_description,
3636                         'player_url': None,
3637                 }
3638
3639                 try:
3640                         self._downloader.process_info(info)
3641                 except UnavailableVideoError, err:
3642                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3643
3644
3645
class PostProcessor(object):
	"""Base class for post processors.

	Post processors are attached to a downloader through its
	add_post_processor() method. Once a download completes successfully,
	the downloader walks its chain of post processors, calling run() on
	each one: first with an initial argument, then with whatever the
	previous post processor returned. The walk ends when a run() call
	returns None or the chain is exhausted.

	Post processors and downloaders register with each other mutually,
	following the same scheme as InfoExtractor objects.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the downloader this post processor belongs to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		The "information" argument is an InfoExtractor-style dictionary
		extended with a "filepath" key that names the downloaded file.

		Returning None aborts the post-processing chain. Returning a
		dictionary (possibly the received one with some fields changed)
		passes it on to the next post processor in the chain.

		This method may also raise a PostProcessingError exception,
		which the calling downloader will take into account.
		"""
		return information # by default, do nothing
3691
3692
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video.

	Uses ffprobe to detect the source audio codec, then ffmpeg to either
	copy the stream losslessly or transcode it into the preferred format,
	writing the result next to the video file.
	"""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path, or None on failure."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# Open devnull explicitly so the handle is closed again (the
			# previous code leaked one file descriptor per invocation).
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			finally:
				devnull.close()
		except (IOError, OSError):
			return None
		# ffprobe prints codec_name= before codec_type= for each stream;
		# report the codec of the first audio stream found.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to extract audio from path into out_path; return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			devnull = open(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				devnull.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		"""Extract audio from information['filepath']; return updated info or None."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(new_path, (time.time(), information['filetime']))
			except (IOError, OSError):
				# Best effort only; a failure to set mtime is not fatal.
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(path)
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
3792
3793
3794 def updateSelf(downloader, filename):
3795         ''' Update the program file with the latest version from the repository '''
3796         # Note: downloader only used for options
3797         if not os.access(filename, os.W_OK):
3798                 sys.exit('ERROR: no write permissions on %s' % filename)
3799
3800         downloader.to_screen('Updating to latest version...')
3801
3802         try:
3803                 try:
3804                         urlh = urllib.urlopen(UPDATE_URL)
3805                         newcontent = urlh.read()
3806                         
3807                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
3808                         if vmatch is not None and vmatch.group(1) == __version__:
3809                                 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3810                                 return
3811                 finally:
3812                         urlh.close()
3813         except (IOError, OSError), err:
3814                 sys.exit('ERROR: unable to download latest version')
3815
3816         try:
3817                 outf = open(filename, 'wb')
3818                 try:
3819                         outf.write(newcontent)
3820                 finally:
3821                         outf.close()
3822         except (IOError, OSError), err:
3823                 sys.exit('ERROR: unable to overwrite current version')
3824
3825         downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3826
def parseOpts():
	"""Build the optparse command-line interface and parse sys.argv.

	Returns a (parser, opts, args) tuple so callers can both read the
	parsed options and report errors through parser.error().
	"""
	# Deferred imports
	import getpass
	import optparse

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		'''Best-effort detection of the terminal width; None if unknown.'''
		columns = os.environ.get('COLUMNS', None)
		if columns:
			# Guard against a non-numeric COLUMNS value; previously this
			# raised an uncaught ValueError and aborted option parsing.
			try:
				return int(columns)
			except ValueError:
				pass

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis" or "mp3"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	opts, args = parser.parse_args()

	return parser, opts, args
4013
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# A few extractors are shared by their search/playlist companions.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()
	ies = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
	]
	# The generic extractor must come last: it accepts nearly any URL.
	ies.append(GenericIE())
	return ies
4047
4048 def main():
4049         parser, opts, args = parseOpts()
4050
4051         # Open appropriate CookieJar
4052         if opts.cookiefile is None:
4053                 jar = cookielib.CookieJar()
4054         else:
4055                 try:
4056                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4057                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4058                                 jar.load()
4059                 except (IOError, OSError), err:
4060                         sys.exit(u'ERROR: unable to open cookie file')
4061
4062         # Dump user agent
4063         if opts.dump_user_agent:
4064                 print std_headers['User-Agent']
4065                 sys.exit(0)
4066
4067         # Batch file verification
4068         batchurls = []
4069         if opts.batchfile is not None:
4070                 try:
4071                         if opts.batchfile == '-':
4072                                 batchfd = sys.stdin
4073                         else:
4074                                 batchfd = open(opts.batchfile, 'r')
4075                         batchurls = batchfd.readlines()
4076                         batchurls = [x.strip() for x in batchurls]
4077                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4078                 except IOError:
4079                         sys.exit(u'ERROR: batch file could not be read')
4080         all_urls = batchurls + args
4081
4082         # General configuration
4083         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4084         opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4085         urllib2.install_opener(opener)
4086         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4087
4088         extractors = gen_extractors()
4089
4090         if opts.list_extractors:
4091                 for ie in extractors:
4092                         print(ie.IE_NAME)
4093                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4094                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4095                         for mu in matchedUrls:
4096                                 print(u'  ' + mu)
4097                 sys.exit(0)
4098
4099         # Conflicting, missing and erroneous options
4100         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4101                 parser.error(u'using .netrc conflicts with giving username/password')
4102         if opts.password is not None and opts.username is None:
4103                 parser.error(u'account username missing')
4104         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4105                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4106         if opts.usetitle and opts.useliteral:
4107                 parser.error(u'using title conflicts with using literal title')
4108         if opts.username is not None and opts.password is None:
4109                 opts.password = getpass.getpass(u'Type account password and press return:')
4110         if opts.ratelimit is not None:
4111                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4112                 if numeric_limit is None:
4113                         parser.error(u'invalid rate limit specified')
4114                 opts.ratelimit = numeric_limit
4115         if opts.retries is not None:
4116                 try:
4117                         opts.retries = long(opts.retries)
4118                 except (TypeError, ValueError), err:
4119                         parser.error(u'invalid retry count specified')
4120         try:
4121                 opts.playliststart = int(opts.playliststart)
4122                 if opts.playliststart <= 0:
4123                         raise ValueError(u'Playlist start must be positive')
4124         except (TypeError, ValueError), err:
4125                 parser.error(u'invalid playlist start number specified')
4126         try:
4127                 opts.playlistend = int(opts.playlistend)
4128                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4129                         raise ValueError(u'Playlist end must be greater than playlist start')
4130         except (TypeError, ValueError), err:
4131                 parser.error(u'invalid playlist end number specified')
4132         if opts.extractaudio:
4133                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
4134                         parser.error(u'invalid audio format specified')
4135
4136         # File downloader
4137         fd = FileDownloader({
4138                 'usenetrc': opts.usenetrc,
4139                 'username': opts.username,
4140                 'password': opts.password,
4141                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4142                 'forceurl': opts.geturl,
4143                 'forcetitle': opts.gettitle,
4144                 'forcethumbnail': opts.getthumbnail,
4145                 'forcedescription': opts.getdescription,
4146                 'forcefilename': opts.getfilename,
4147                 'forceformat': opts.getformat,
4148                 'simulate': opts.simulate,
4149                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4150                 'format': opts.format,
4151                 'format_limit': opts.format_limit,
4152                 'listformats': opts.listformats,
4153                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4154                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4155                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4156                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4157                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4158                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4159                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4160                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4161                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4162                         or u'%(id)s.%(ext)s'),
4163                 'ignoreerrors': opts.ignoreerrors,
4164                 'ratelimit': opts.ratelimit,
4165                 'nooverwrites': opts.nooverwrites,
4166                 'retries': opts.retries,
4167                 'continuedl': opts.continue_dl,
4168                 'noprogress': opts.noprogress,
4169                 'playliststart': opts.playliststart,
4170                 'playlistend': opts.playlistend,
4171                 'logtostderr': opts.outtmpl == '-',
4172                 'consoletitle': opts.consoletitle,
4173                 'nopart': opts.nopart,
4174                 'updatetime': opts.updatetime,
4175                 'writedescription': opts.writedescription,
4176                 'writeinfojson': opts.writeinfojson,
4177                 'matchtitle': opts.matchtitle,
4178                 'rejecttitle': opts.rejecttitle,
4179                 })
4180         for extractor in extractors:
4181                 fd.add_info_extractor(extractor)
4182
4183         # PostProcessors
4184         if opts.extractaudio:
4185                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4186
4187         # Update version
4188         if opts.update_self:
4189                 updateSelf(fd, sys.argv[0])
4190
4191         # Maybe do nothing
4192         if len(all_urls) < 1:
4193                 if not opts.update_self:
4194                         parser.error(u'you must provide at least one URL')
4195                 else:
4196                         sys.exit()
4197         retcode = fd.download(all_urls)
4198
4199         # Dump cookie jar if requested
4200         if opts.cookiefile is not None:
4201                 try:
4202                         jar.save()
4203                 except (IOError, OSError), err:
4204                         sys.exit(u'ERROR: unable to save cookie jar')
4205
4206         sys.exit(retcode)
4207
4208
# Script entry point: run main() and map the known fatal exceptions onto a
# process exit status / message instead of a raw traceback.
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# NOTE(review): presumably the download machinery already printed
		# the error details before raising — confirm against the
		# FileDownloader code; here we only signal failure via status 1.
		sys.exit(1)
	except SameFileError:
		# Passing a message to sys.exit() prints it to stderr and exits
		# with status 1.
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		# Ctrl-C: the leading newline moves the message past any partial
		# progress-bar output on the current console line.
		sys.exit(u'\nERROR: Interrupted by user')
4218
4219 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: