youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import calendar
   5 import codecs
   6 import contextlib
   7 import ctypes
   8 import datetime
   9 import email.utils
  10 import errno
  11 import getpass
  12 import gzip
  13 import itertools
  14 import io
  15 import json
  16 import locale
  17 import math
  18 import os
  19 import pipes
  20 import platform
  21 import re
  22 import ssl
  23 import socket
  24 import struct
  25 import subprocess
  26 import sys
  27 import traceback
  28 import xml.etree.ElementTree
  29 import zlib
  30
  31 try:
  32     import urllib.request as compat_urllib_request
  33 except ImportError: # Python 2
  34     import urllib2 as compat_urllib_request
  35
  36 try:
  37     import urllib.error as compat_urllib_error
  38 except ImportError: # Python 2
  39     import urllib2 as compat_urllib_error
  40
  41 try:
  42     import urllib.parse as compat_urllib_parse
  43 except ImportError: # Python 2
  44     import urllib as compat_urllib_parse
  45
  46 try:
  47     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  48 except ImportError: # Python 2
  49     from urlparse import urlparse as compat_urllib_parse_urlparse
  50
  51 try:
  52     import urllib.parse as compat_urlparse
  53 except ImportError: # Python 2
  54     import urlparse as compat_urlparse
  55
  56 try:
  57     import http.cookiejar as compat_cookiejar
  58 except ImportError: # Python 2
  59     import cookielib as compat_cookiejar
  60
  61 try:
  62     import html.entities as compat_html_entities
  63 except ImportError: # Python 2
  64     import htmlentitydefs as compat_html_entities
  65
  66 try:
  67     import html.parser as compat_html_parser
  68 except ImportError: # Python 2
  69     import HTMLParser as compat_html_parser
  70
  71 try:
  72     import http.client as compat_http_client
  73 except ImportError: # Python 2
  74     import httplib as compat_http_client
  75
  76 try:
  77     from urllib.error import HTTPError as compat_HTTPError
  78 except ImportError:  # Python 2
  79     from urllib2 import HTTPError as compat_HTTPError
  80
  81 try:
  82     from urllib.request import urlretrieve as compat_urlretrieve
  83 except ImportError:  # Python 2
  84     from urllib import urlretrieve as compat_urlretrieve
  85
  86
# compat_subprocess_get_DEVNULL() returns something usable as a "discard"
# stream for subprocess calls: subprocess.DEVNULL when that name can be
# imported, otherwise a newly opened write handle on os.devnull.
# Note that the fallback opens a new file object on every call.
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  92
  93 try:
  94     from urllib.parse import parse_qs as compat_parse_qs
  95 except ImportError: # Python 2
  96     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
  97     # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Only defined on the Python 2 fallback path (it relies on
        str.decode('hex')).  Runs of consecutive escapes are collected as
        raw bytes and decoded together with `encoding`/`errors`, so that
        multi-byte characters are decoded correctly.  A malformed escape is
        left in the output as a literal '%' followed by the text.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to unquote
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    # "%%" or a trailing "%": treat as malformed
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
 132
    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Fields are separated by '&' or ';'.  '+' is turned into a space and
        %xx escapes are resolved with _unquote().  Fields with an empty
        value are dropped unless keep_blank_values is true.  With
        strict_parsing, a field without '=' raises ValueError; otherwise it
        is skipped (or kept with an empty value if keep_blank_values is set).
        Only defined on the Python 2 fallback path (uses `unicode`).
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r
 159
 160     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 161                 encoding='utf-8', errors='replace'):
 162         parsed_result = {}
 163         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 164                         encoding=encoding, errors=errors)
 165         for name, value in pairs:
 166             if name in parsed_result:
 167                 parsed_result[name].append(value)
 168             else:
 169                 parsed_result[name] = [value]
 170         return parsed_result
 171
# Text type: `unicode` where that builtin exists, `str` otherwise
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# Exception type to catch for XML parse failures
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
 186
 187 def compat_ord(c):
 188     if type(c) is int: return c
 189     else: return ord(c)
 190
# This is not clearly defined otherwise
# (the type of compiled regular expression objects, obtained by compiling one)
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request() sets each of them on
# every request it processes, replacing any value already present.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 201
 202 def preferredencoding():
 203     """Get preferred encoding.
 204
 205     Returns the best encoding scheme for the system, based on
 206     locale.getpreferredencoding() and some further tweaks.
 207     """
 208     try:
 209         pref = locale.getpreferredencoding()
 210         u'TEST'.encode(pref)
 211     except:
 212         pref = 'UTF-8'
 213
 214     return pref
 215
 216 if sys.version_info < (3,0):
 217     def compat_print(s):
 218         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 219 else:
 220     def compat_print(s):
 221         assert type(s) == type(u'')
 222         print(s)
 223
 224 # In Python 2.x, json.dump expects a bytestream.
 225 # In Python 3.x, it writes to a character stream
 226 if sys.version_info < (3,0):
 227     def write_json_file(obj, fn):
 228         with open(fn, 'wb') as f:
 229             json.dump(obj, f)
 230 else:
 231     def write_json_file(obj, fn):
 232         with open(fn, 'w', encoding='utf-8') as f:
 233             json.dump(obj, f)
 234
 235 if sys.version_info >= (2,7):
 236     def find_xpath_attr(node, xpath, key, val):
 237         """ Find the xpath xpath[@key=val] """
 238         assert re.match(r'^[a-zA-Z]+$', key)
 239         assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
 240         expr = xpath + u"[@%s='%s']" % (key, val)
 241         return node.find(expr)
 242 else:
 243     def find_xpath_attr(node, xpath, key, val):
 244         for f in node.findall(xpath):
 245             if f.attrib.get(key) == val:
 246                 return f
 247         return None
 248
 249 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 250 # the namespace parameter
 251 def xpath_with_ns(path, ns_map):
 252     components = [c.split(':') for c in path.split('/')]
 253     replaced = []
 254     for c in components:
 255         if len(c) == 1:
 256             replaced.append(c[0])
 257         else:
 258             ns, tag = c
 259             replaced.append('{%s}%s' % (ns_map[ns], tag))
 260     return '/'.join(replaced)
 261
 262 def htmlentity_transform(matchobj):
 263     """Transforms an HTML entity to a character.
 264
 265     This function receives a match object and is intended to be used with
 266     the re.sub() function.
 267     """
 268     entity = matchobj.group(1)
 269
 270     # Known non-numeric HTML entity
 271     if entity in compat_html_entities.name2codepoint:
 272         return compat_chr(compat_html_entities.name2codepoint[entity])
 273
 274     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 275     if mobj is not None:
 276         numstr = mobj.group(1)
 277         if numstr.startswith(u'x'):
 278             base = 16
 279             numstr = u'0%s' % numstr
 280         else:
 281             base = 10
 282         return compat_chr(int(numstr, base))
 283
 284     # Unknown entity in name, return its literal representation
 285     return (u'&%s;' % entity)
 286
# Replace the module-level start-tag regex of the HTML parser module with
# this pattern (marked upstream as a backported bug fix).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 288 class BaseHTMLParser(compat_html_parser.HTMLParser):
 289     def __init(self):
 290         compat_html_parser.HTMLParser.__init__(self)
 291         self.html = None
 292
 293     def loads(self, html):
 294         self.html = html
 295         self.feed(html)
 296         self.close()
 297
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start_position, end_position] once the element has
        # been located; positions are (line, column) pairs from getpos()
        self.result = None
        # True while parsing inside the matched element
        self.started = False
        # Per-tag-name nesting counters, maintained while self.started
        self.depth = {}
        # True between the matching start tag and the next parser event,
        # whose position is recorded as the start of the content
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Try to recover from a parse error by skipping ahead.

        Gives up (raises HTMLParseError) once more than 10 errors have been
        seen, or if the error occurs inside the matched element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The matched element is closed when the counter for its own
            # tag name drops back to zero
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other kind of parser event also marks the start of the content
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the recorded start and end positions,
        stripped of surrounding whitespace, or None if the element was not
        found or not properly closed."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # getpos() line numbers are 1-based
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line: the end column has to be
            # shifted by the amount already cut off at the front
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, step over the literal text "</scr'+'ipt>"
# instead of handing it to the regular end-tag parser.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 363
 364 def get_element_by_id(id, html):
 365     """Return the content of the tag with the specified ID in the passed HTML document"""
 366     return get_element_by_attribute("id", id, html)
 367
 368 def get_element_by_attribute(attribute, value, html):
 369     """Return the content of the tag with the specified attribute in the passed HTML document"""
 370     parser = AttrParser(attribute, value)
 371     try:
 372         parser.loads(html)
 373     except compat_html_parser.HTMLParseError:
 374         pass
 375     return parser.get_result()
 376
 377 class MetaParser(BaseHTMLParser):
 378     """
 379     Modified HTMLParser that isolates a meta tag with the specified name
 380     attribute.
 381     """
 382     def __init__(self, name):
 383         BaseHTMLParser.__init__(self)
 384         self.name = name
 385         self.content = None
 386         self.result = None
 387
 388     def handle_starttag(self, tag, attrs):
 389         if tag != 'meta':
 390             return
 391         attrs = dict(attrs)
 392         if attrs.get('name') == self.name:
 393             self.result = attrs.get('content')
 394
 395     def get_result(self):
 396         return self.result
 397
 398 def get_meta_content(name, html):
 399     """
 400     Return the content attribute from the meta tag with the given name attribute.
 401     """
 402     parser = MetaParser(name)
 403     try:
 404         parser.loads(html)
 405     except compat_html_parser.HTMLParseError:
 406         pass
 407     return parser.get_result()
 408
 409
 410 def clean_html(html):
 411     """Clean an HTML snippet into a readable string"""
 412     # Newline vs <br />
 413     html = html.replace('\n', ' ')
 414     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 415     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 416     # Strip html tags
 417     html = re.sub('<.*?>', '', html)
 418     # Replace html entities
 419     html = unescapeHTML(html)
 420     return html.strip()
 421
 422
 423 def sanitize_open(filename, open_mode):
 424     """Try to open the given filename, and slightly tweak it if this fails.
 425
 426     Attempts to open the given filename. If this fails, it tries to change
 427     the filename slightly, step by step, until it's either able to open it
 428     or it fails and raises a final exception, like the standard open()
 429     function.
 430
 431     It returns the tuple (stream, definitive_file_name).
 432     """
 433     try:
 434         if filename == u'-':
 435             if sys.platform == 'win32':
 436                 import msvcrt
 437                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 438             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 439         stream = open(encodeFilename(filename), open_mode)
 440         return (stream, filename)
 441     except (IOError, OSError) as err:
 442         if err.errno in (errno.EACCES,):
 443             raise
 444
 445         # In case of error, try to remove win32 forbidden chars
 446         alt_filename = os.path.join(
 447                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 448                         for path_part in os.path.split(filename)
 449                        )
 450         if alt_filename == filename:
 451             raise
 452         else:
 453             # An exception here should be caught in the caller
 454             stream = open(encodeFilename(filename), open_mode)
 455             return (stream, alt_filename)
 456
 457
 458 def timeconvert(timestr):
 459     """Convert RFC 2822 defined time string into system timestamp"""
 460     timestamp = None
 461     timetuple = email.utils.parsedate_tz(timestr)
 462     if timetuple is not None:
 463         timestamp = email.utils.mktime_tz(timetuple)
 464     return timestamp
 465
 466 def sanitize_filename(s, restricted=False, is_id=False):
 467     """Sanitizes a string so it could be used as part of a filename.
 468     If restricted is set, use a stricter subset of allowed characters.
 469     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 470     """
 471     def replace_insane(char):
 472         if char == '?' or ord(char) < 32 or ord(char) == 127:
 473             return ''
 474         elif char == '"':
 475             return '' if restricted else '\''
 476         elif char == ':':
 477             return '_-' if restricted else ' -'
 478         elif char in '\\/|*<>':
 479             return '_'
 480         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 481             return '_'
 482         if restricted and ord(char) > 127:
 483             return '_'
 484         return char
 485
 486     result = u''.join(map(replace_insane, s))
 487     if not is_id:
 488         while '__' in result:
 489             result = result.replace('__', '_')
 490         result = result.strip('_')
 491         # Common case of "Foreign band name - English song title"
 492         if restricted and result.startswith('-_'):
 493             result = result[2:]
 494         if not result:
 495             result = '_'
 496     return result
 497
 498 def orderedSet(iterable):
 499     """ Remove all duplicates from the input iterable """
 500     res = []
 501     for el in iterable:
 502         if el not in res:
 503             res.append(el)
 504     return res
 505
 506
 507 def unescapeHTML(s):
 508     if s is None:
 509         return None
 510     assert type(s) == compat_str
 511
 512     result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
 513     return result
 514
 515
 516 def encodeFilename(s, for_subprocess=False):
 517     """
 518     @param s The name of the file
 519     """
 520
 521     assert type(s) == compat_str
 522
 523     # Python 3 has a Unicode API
 524     if sys.version_info >= (3, 0):
 525         return s
 526
 527     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 528         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 529         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 530         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 531         if not for_subprocess:
 532             return s
 533         else:
 534             # For subprocess calls, encode with locale encoding
 535             # Refer to http://stackoverflow.com/a/9951851/35070
 536             encoding = preferredencoding()
 537     else:
 538         encoding = sys.getfilesystemencoding()
 539     if encoding is None:
 540         encoding = 'utf-8'
 541     return s.encode(encoding, 'ignore')
 542
 543 def decodeOption(optval):
 544     if optval is None:
 545         return optval
 546     if isinstance(optval, bytes):
 547         optval = optval.decode(preferredencoding())
 548
 549     assert isinstance(optval, compat_str)
 550     return optval
 551
 552 def formatSeconds(secs):
 553     if secs > 3600:
 554         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 555     elif secs > 60:
 556         return '%d:%02d' % (secs // 60, secs % 60)
 557     else:
 558         return '%d' % secs
 559
 560
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the HTTPS handler to install in the URL opener.

    opts_no_check_certificate: if true, certificates are not verified
        (only consulted on the Python >= 3.2 branch).
    kwargs: passed on to the handler constructor.
    """
    # NOTE(review): both branches ask for SSLv3 explicitly; SSLv3 is widely
    # regarded as insecure and may be unavailable in newer ssl builds.
    # Confirm whether this is still wanted.
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                # Set up the proxy tunnel first, if one is configured
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try SSLv3 first, fall back to SSLv23 if that fails
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 594
 595 class ExtractorError(Exception):
 596     """Error during info extraction."""
 597     def __init__(self, msg, tb=None, expected=False, cause=None):
 598         """ tb, if given, is the original traceback (so that it can be printed out).
 599         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 600         """
 601
 602         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 603             expected = True
 604         if not expected:
 605             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 606         super(ExtractorError, self).__init__(msg)
 607
 608         self.traceback = tb
 609         self.exc_info = sys.exc_info()  # preserve original exception
 610         self.cause = cause
 611
 612     def format_traceback(self):
 613         if self.traceback is None:
 614             return None
 615         return u''.join(traceback.format_tb(self.traceback))
 616
 617
 618 class RegexNotFoundError(ExtractorError):
 619     """Error when a regex didn't match"""
 620     pass
 621
 622
 623 class DownloadError(Exception):
 624     """Download Error exception.
 625
 626     This exception may be thrown by FileDownloader objects if they are not
 627     configured to continue on errors. They will contain the appropriate
 628     error message.
 629     """
 630     def __init__(self, msg, exc_info=None):
 631         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 632         super(DownloadError, self).__init__(msg)
 633         self.exc_info = exc_info
 634
 635
 636 class SameFileError(Exception):
 637     """Same File exception.
 638
 639     This exception will be thrown by FileDownloader objects if they detect
 640     multiple files would have to be downloaded to the same file on disk.
 641     """
 642     pass
 643
 644
 645 class PostProcessingError(Exception):
 646     """Post Processing exception.
 647
 648     This exception may be raised by PostProcessor's .run() method to
 649     indicate an error in the postprocessing task.
 650     """
 651     def __init__(self, msg):
 652         self.msg = msg
 653
 654 class MaxDownloadsReached(Exception):
 655     """ --max-downloads limit has been reached. """
 656     pass
 657
 658
 659 class UnavailableVideoError(Exception):
 660     """Unavailable Format exception.
 661
 662     This exception will be thrown when a video is requested
 663     in a format that is not available for that video.
 664     """
 665     pass
 666
 667
 668 class ContentTooShortError(Exception):
 669     """Content Too Short exception.
 670
 671     This exception may be raised by FileDownloader objects when a file they
 672     download is too small for what the server announced first, indicating
 673     the connection was probably interrupted.
 674     """
 675     # Both in bytes
 676     downloaded = None
 677     expected = None
 678
 679     def __init__(self, downloaded, expected):
 680         self.downloaded = downloaded
 681         self.expected = expected
 682
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    In the same way, the header "Youtubedl-User-Agent" can be used to
    override the User-Agent of a single request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a
        zlib-wrapped stream second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code, also on
        versions whose addinfourl does not take it as an argument."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Apply the standard headers and resolve the Youtubedl-* pseudo
        headers before the request is sent."""
        # The standard headers replace whatever the request already had
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # NOTE(review): the lookups below use 'Xxx-yyy' casing, presumably
        # because that is how add_header() stores header names - confirm.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return resp, with its body transparently decompressed if the
        Content-encoding header is gzip or deflate."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes cut off, one more byte
                # on each attempt; give up with the first error otherwise
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same treatment
    https_request = http_request
    https_response = http_response
 763
 764
 765 def parse_iso8601(date_str):
 766     """ Return a UNIX timestamp from the given date """
 767
 768     if date_str is None:
 769         return None
 770
 771     m = re.search(
 772         r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
 773         date_str)
 774     if not m:
 775         timezone = datetime.timedelta()
 776     else:
 777         date_str = date_str[:-len(m.group(0))]
 778         if not m.group('sign'):
 779             timezone = datetime.timedelta()
 780         else:
 781             sign = 1 if m.group('sign') == '+' else -1
 782             timezone = datetime.timedelta(
 783                 hours=sign * int(m.group('hours')),
 784                 minutes=sign * int(m.group('minutes')))
 785
 786     dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone
 787     return calendar.timegm(dt.timetuple())
 788
 789
 790 def unified_strdate(date_str):
 791     """Return a string with the date in the format YYYYMMDD"""
 792
 793     if date_str is None:
 794         return None
 795
 796     upload_date = None
 797     #Replace commas
 798     date_str = date_str.replace(',', ' ')
 799     # %z (UTC offset) is only supported in python>=3.2
 800     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 801     format_expressions = [
 802         '%d %B %Y',
 803         '%d %b %Y',
 804         '%B %d %Y',
 805         '%b %d %Y',
 806         '%Y-%m-%d',
 807         '%d.%m.%Y',
 808         '%d/%m/%Y',
 809         '%Y/%m/%d %H:%M:%S',
 810         '%Y-%m-%d %H:%M:%S',
 811         '%d.%m.%Y %H:%M',
 812         '%d.%m.%Y %H.%M',
 813         '%Y-%m-%dT%H:%M:%SZ',
 814         '%Y-%m-%dT%H:%M:%S.%fZ',
 815         '%Y-%m-%dT%H:%M:%S.%f0Z',
 816         '%Y-%m-%dT%H:%M:%S',
 817         '%Y-%m-%dT%H:%M:%S.%f',
 818         '%Y-%m-%dT%H:%M',
 819     ]
 820     for expression in format_expressions:
 821         try:
 822             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 823         except ValueError:
 824             pass
 825     if upload_date is None:
 826         timetuple = email.utils.parsedate_tz(date_str)
 827         if timetuple:
 828             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 829     return upload_date
 830
 831 def determine_ext(url, default_ext=u'unknown_video'):
 832     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 833     if re.match(r'^[A-Za-z0-9]+$', guess):
 834         return guess
 835     else:
 836         return default_ext
 837
 838 def subtitles_filename(filename, sub_lang, sub_format):
 839     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 840
 841 def date_from_str(date_str):
 842     """
 843     Return a datetime object from a string in the format YYYYMMDD or
 844     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 845     today = datetime.date.today()
 846     if date_str == 'now'or date_str == 'today':
 847         return today
 848     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 849     if match is not None:
 850         sign = match.group('sign')
 851         time = int(match.group('time'))
 852         if sign == '-':
 853             time = -time
 854         unit = match.group('unit')
 855         #A bad aproximation?
 856         if unit == 'month':
 857             unit = 'day'
 858             time *= 30
 859         elif unit == 'year':
 860             unit = 'day'
 861             time *= 365
 862         unit += 's'
 863         delta = datetime.timedelta(**{unit: time})
 864         return today + delta
 865     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 866
 867 def hyphenate_date(date_str):
 868     """
 869     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 870     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 871     if match is not None:
 872         return '-'.join(match.groups())
 873     else:
 874         return date_str
 875
 876 class DateRange(object):
 877     """Represents a time interval between two dates"""
 878     def __init__(self, start=None, end=None):
 879         """start and end must be strings in the format accepted by date"""
 880         if start is not None:
 881             self.start = date_from_str(start)
 882         else:
 883             self.start = datetime.datetime.min.date()
 884         if end is not None:
 885             self.end = date_from_str(end)
 886         else:
 887             self.end = datetime.datetime.max.date()
 888         if self.start > self.end:
 889             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 890     @classmethod
 891     def day(cls, day):
 892         """Returns a range that only contains the given day"""
 893         return cls(day,day)
 894     def __contains__(self, date):
 895         """Check if the date is in the range"""
 896         if not isinstance(date, datetime.date):
 897             date = date_from_str(date)
 898         return self.start <= date <= self.end
 899     def __str__(self):
 900         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 901
 902
 903 def platform_name():
 904     """ Returns the platform name as a compat_str """
 905     res = platform.platform()
 906     if isinstance(res, bytes):
 907         res = res.decode(preferredencoding())
 908
 909     assert isinstance(res, compat_str)
 910     return res
 911
 912
 913 def _windows_write_string(s, out):
 914     """ Returns True if the string was written using special methods,
 915     False if it has yet to be written out."""
 916     # Adapted from http://stackoverflow.com/a/3259271/35070
 917
 918     import ctypes
 919     import ctypes.wintypes
 920
 921     WIN_OUTPUT_IDS = {
 922         1: -11,
 923         2: -12,
 924     }
 925
 926     def ucs2_len(s):
 927         return sum((2 if ord(c) > 0xffff else 1) for c in s)
 928
 929     fileno = out.fileno()
 930     if fileno not in WIN_OUTPUT_IDS:
 931         return False
 932
 933     GetStdHandle = ctypes.WINFUNCTYPE(
 934         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
 935         ("GetStdHandle", ctypes.windll.kernel32))
 936     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
 937
 938     WriteConsoleW = ctypes.WINFUNCTYPE(
 939         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
 940         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
 941         ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
 942     written = ctypes.wintypes.DWORD(0)
 943
 944     GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
 945     FILE_TYPE_CHAR = 0x0002
 946     FILE_TYPE_REMOTE = 0x8000
 947     GetConsoleMode = ctypes.WINFUNCTYPE(
 948         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
 949         ctypes.POINTER(ctypes.wintypes.DWORD))(
 950         ("GetConsoleMode", ctypes.windll.kernel32))
 951     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
 952
 953     def not_a_console(handle):
 954         if handle == INVALID_HANDLE_VALUE or handle is None:
 955             return True
 956         return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
 957                 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
 958
 959     if not_a_console(h):
 960         return False
 961
 962     remaining = ucs2_len(s)
 963     while remaining > 0:
 964         ret = WriteConsoleW(
 965             h, s, min(remaining, 1024), ctypes.byref(written), None)
 966         if ret == 0:
 967             raise OSError('Failed to write string')
 968         remaining -= written.value
 969     return True
 970
 971
 972 def write_string(s, out=None, encoding=None):
 973     if out is None:
 974         out = sys.stderr
 975     assert type(s) == compat_str
 976
 977     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 978         if _windows_write_string(s, out):
 979             return
 980
 981     if ('b' in getattr(out, 'mode', '') or
 982             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 983         byt = s.encode(encoding or preferredencoding(), 'ignore')
 984         out.write(byt)
 985     elif hasattr(out, 'buffer'):
 986         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 987         byt = s.encode(enc, 'ignore')
 988         out.buffer.write(byt)
 989     else:
 990         out.write(s)
 991     out.flush()
 992
 993
 994 def bytes_to_intlist(bs):
 995     if not bs:
 996         return []
 997     if isinstance(bs[0], int):  # Python 3
 998         return list(bs)
 999     else:
1000         return [ord(c) for c in bs]
1001
1002
def intlist_to_bytes(xs):
    """Convert a sequence of integer byte values into a byte string; an
    empty input gives b''."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):  # Python 3
        return bytes(xs)
    # Python 2: chr() returns a 1-byte str
    return ''.join([chr(x) for x in xs])
1010
1011
def get_cachedir(params={}):
    """Return the cache directory: params['cachedir'] if present, else
    'youtube-dl' below $XDG_CACHE_HOME (or below ~/.cache when that
    variable is not set)."""
    fallback_root = os.path.expanduser('~/.cache')
    cache_root = os.environ.get('XDG_CACHE_HOME', fallback_root)
    default_dir = os.path.join(cache_root, 'youtube-dl')
    return params.get('cachedir', default_dir)
1016
1017
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx through ctypes on Windows, fcntl.lockf
# everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """ctypes layout of the OVERLAPPED structure that LockFileEx and
        UnlockFileEx take; its Offset fields give the start of the locked
        region."""
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high 32-bit halves of the number of bytes to lock, passed to
    # both calls; the names indicate this is meant to span the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f from offset 0; exclusive selects an
        exclusive lock instead of a shared one.
        Raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file has to pass the
        # same OVERLAPPED pointer again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 is LOCKFILE_EXCLUSIVE_LOCK, 0x0 asks for a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file on the open file f.
        Raises OSError if UnlockFileEx fails."""
        # Only files previously passed to _lock_file carry this attribute
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the open file f with fcntl.lockf: LOCK_EX if exclusive is
        true, LOCK_SH otherwise."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the fcntl.lockf lock held on the open file f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
1081
1082
class locked_file(object):
    """File object wrapper that holds a lock on the file while it is used
    as a context manager.

    The file is opened in the constructor.  Entering the context takes a
    shared lock for mode 'r' and an exclusive lock for 'a' and 'w';
    leaving it releases the lock and closes the file."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            # Do not leave the file open if the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Close the file even if releasing the lock fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        """Iterate over the underlying file"""
        return iter(self.f)

    def read(self, *args):
        """Forward to the read method of the underlying file"""
        return self.f.read(*args)

    def write(self, *args):
        """Forward to the write method of the underlying file"""
        return self.f.write(*args)
1112
1113
def shell_quote(args):
    """Return the arguments in args as one shell-escaped command line.

    Every argument is quoted for a POSIX shell and the results are joined
    with single spaces.  Byte strings are decoded with the filesystem
    encoding first (utf-8 if that encoding is unknown).
    """
    # The pipes module is gone in Python 3.13; shlex.quote is the same
    # function and exists since Python 3.3
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1125
1126
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yields the element that ends the
        run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1134
1135
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended to url as a URL-encoded
    '#__youtubedl_smuggle=...' fragment. """

    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return u'#'.join((url, fragment))
1142
1143
def unsmuggle_url(smug_url, default=None):
    """ Split a URL carrying a '#__youtubedl_smuggle' fragment into
    (url, data), where data is the decoded JSON payload.
    A URL without such a fragment gives (smug_url, default). """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _sep, fragment = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return plain_url, json.loads(encoded)
1151
1152
def format_bytes(bytes):
    """Format a byte count with two decimals and a binary unit suffix
    (B, KiB, MiB, ...).  None gives u'N/A'; a str is converted with float()
    first."""
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log(0) is undefined, so zero is handled separately
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    scaled = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (scaled, suffixes[exponent])
1165
1166
def str_to_int(int_str):
    """Convert a string to an int after removing every ',' and '.'
    (thousands separators)."""
    digits_only = re.sub(r'[,\.]', u'', int_str)
    return int(digits_only)
1170
1171
def get_term_width():
    """Return the width of the terminal in columns, or None if unknown.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the second field of the output of `stty size` is used.
    Any failure to determine the width gives None instead of an exception.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed COLUMNS; ask the terminal instead

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except (OSError, IndexError, ValueError):
        # stty missing, no terminal attached, or unexpected output
        return None
1186
1187
def month_by_name(name):
    """ Return the number (1-12) of a month given its full English name,
    independently of the locale; None if the name is not recognised """

    english_names = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name in english_names:
        return english_names.index(name) + 1
    return None
1198
1199
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'.

    An ampersand that already starts one of the entities amp, lt, gt,
    apos, quot or a numeric character reference is left alone."""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub(u'&amp;', xml_str)
1206
1207
def setproctitle(title):
    """Set the process name to title via prctl(PR_SET_NAME) of libc.so.6.

    title must be a compat_str.  Nothing happens (and nothing is raised) if
    libc.so.6 cannot be loaded or has no prctl function.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes allocates len + 1 bytes, so
    # prctl gets a NUL-terminated string.  A buffer of exactly len bytes
    # has no terminator and lets prctl read past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1221
1222
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged if it
    does not begin with start."""
    if not s.startswith(start):
        return s
    return s[len(start):]
1227
1228
def url_basename(url):
    """Return the last segment of the path of url, ignoring leading and
    trailing slashes (an empty string if the path is empty)."""
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip(u'/').split(u'/')
    return segments[-1]
1232
1233
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD"""

    def get_method(self):
        return 'HEAD'
1237
1238
def int_or_none(v, scale=1, default=None):
    """Return int(v) floor-divided by scale, or default if v is None."""
    if v is None:
        return default
    return int(v) // scale
1241
1242
def float_or_none(v, scale=1, default=None):
    """Return float(v) divided by scale, or default if v is None."""
    if v is None:
        return default
    return float(v) / scale
1245
1246
def parse_duration(s):
    """Parse a duration such as '12', '1:23', '9:12:43' or '3h11m53s' and
    return it in seconds.

    Returns None if s is None or does not have one of these forms."""
    if s is None:
        return None

    match = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if match is None:
        return None

    total = int(match.group('secs'))
    minutes = match.group('mins')
    if not minutes:
        return total
    total += 60 * int(minutes)
    # The pattern only allows hours together with minutes
    hours = match.group('hours')
    if hours:
        total += 60 * 60 * int(hours)
    return total
1261
1262
def prepend_extension(filename, ext):
    """Insert ext in front of the extension of filename
    ('abc.ext' with 'temp' gives 'abc.temp.ext')."""
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
1266
1267
def check_executable(exe, args=[]):
    """ Checks if the given binary can be started (e.g. because it is
    installed somewhere in PATH) and returns its name, or False if it
    cannot.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1276
1277
class PagedList(object):
    """List-like view on results that are fetched page by page.

    pagefunc(pagenum) must return an iterable with the entries of the
    zero-based page pagenum; pagesize is the number of entries of a full
    page."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries with index start (inclusive) to end
        (exclusive, None meaning no limit) as a list, fetching only the
        pages that are needed."""
        size = self._pagesize
        collected = []
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            page_limit = page_first + size
            if start >= page_limit:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            lo = start % size if page_first <= start < page_limit else 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and page_first <= end <= page_limit:
                hi = ((end - 1) % size) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # Stop on a page that is not "full" (there are no more entries
            # on further pages) and also when the whole page was taken but
            # the slice ends exactly at its end.
            if len(entries) + lo < size or end == page_limit:
                break
        return collected
1323
1324
def uppercase_escape(s):
    """Replace every literal '\\UXXXXXXXX' escape (8 hex digits) in s by the
    character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1331
# struct_pack / struct_unpack: struct.pack / struct.unpack that also accept a
# text (compat_str) format specification on every supported Python version
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1348
1349
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file and return them as a list.

    batch_fd is an iterable of lines (text or UTF-8 encoded bytes) and is
    closed when reading is done.  Surrounding whitespace and a leading
    byte order mark are removed from every line; empty lines and lines
    starting with '#', ';' or ']' are skipped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM appears as U+FEFF when the data was decoded as UTF-8
        # and as the three characters \xef\xbb\xbf when it was decoded
        # with a single-byte codec
        for bom in (u'\xef\xbb\xbf', u'\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1364
1365
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode does and return the result
    as ASCII bytes, ready to be used as POST data."""
    query = compat_urllib_parse.urlencode(*args, **kargs)
    return query.encode('ascii')
1368
1369
def parse_xml(s):
    """Parse the XML document in the text s and return its root element.

    s is encoded as UTF-8 before parsing and doctype declarations are
    ignored."""
    etree = xml.etree.ElementTree

    class DoctypeIgnoringBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=DoctypeIgnoringBuilder())
    extra = {}
    if sys.version_info >= (2, 7):
        extra['parser'] = parser
    return etree.XML(s.encode('utf-8'), **extra)
1378
1379
# compat_getpass: getpass.getpass, except on Python 2 under Windows, where a
# text prompt is encoded with the preferred encoding first
if sys.platform != 'win32' or sys.version_info >= (3, 0):
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
1387
1388
# Maps US content rating labels to an integer.
# NOTE(review): the values look like minimum viewer ages in years - confirm
# against the code that reads this table.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1396
1397
def strip_jsonp(code):
    """Strip a JSONP wrapper and return the wrapped payload.

    'callback( <payload> );' becomes '<payload>'.  The callback name may
    contain ASCII letters, digits and underscores, and the semicolon after
    the closing parenthesis is optional.  Text that is not wrapped this
    way is returned unchanged.
    """
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)