gitweb @ CieloNegro.org - youtube-dl.git/blobdiff - youtube_dl/InfoExtractors.py
Refactor IDParser to search for elements by any attribute not just ID
[youtube-dl.git] / youtube_dl / InfoExtractors.py
old mode 100644 (file)
new mode 100755 (executable)
index 1b37eb6..9a41dde
@@ -23,7 +23,7 @@ class InfoExtractor(object):
     Information extractors are the classes that, given a URL, extract
     information about the video (or videos) the URL refers to. This
     information includes the real video URL, the video title, author and
     Information extractors are the classes that, given a URL, extract
     information about the video (or videos) the URL refers to. This
     information includes the real video URL, the video title, author and
-    others. The information is stored in a dictionary which is then 
+    others. The information is stored in a dictionary which is then
     passed to the FileDownloader. The FileDownloader processes this
     information possibly downloading the video to the file system, among
     other possible outcomes.
     passed to the FileDownloader. The FileDownloader processes this
     information possibly downloading the video to the file system, among
     other possible outcomes.
@@ -159,7 +159,7 @@ class YoutubeIE(InfoExtractor):
         '44': '480x854',
         '45': '720x1280',
         '46': '1080x1920',
         '44': '480x854',
         '45': '720x1280',
         '46': '1080x1920',
-    }   
+    }
     IE_NAME = u'youtube'
 
     def suitable(self, url):
     IE_NAME = u'youtube'
 
     def suitable(self, url):
@@ -272,7 +272,7 @@ class YoutubeIE(InfoExtractor):
         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
         try:
             self.report_login()
         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
         try:
             self.report_login()
-            login_results = compat_urllib_request.urlopen(request).read()
+            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
             if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                 return
             if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                 return
@@ -288,7 +288,7 @@ class YoutubeIE(InfoExtractor):
         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
         try:
             self.report_age_confirmation()
         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
         try:
             self.report_age_confirmation()
-            age_results = compat_urllib_request.urlopen(request).read()
+            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
             return
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
             return
@@ -399,7 +399,7 @@ class YoutubeIE(InfoExtractor):
                 self.report_video_subtitles_download(video_id)
                 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                 try:
                 self.report_video_subtitles_download(video_id)
                 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                 try:
-                    srt_list = compat_urllib_request.urlopen(request).read()
+                    srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                     raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                     raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
@@ -416,14 +416,14 @@ class YoutubeIE(InfoExtractor):
                     raise Trouble(u'WARNING: no closed captions found in the specified language')
                 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                 try:
                     raise Trouble(u'WARNING: no closed captions found in the specified language')
                 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                 try:
-                    srt_xml = compat_urllib_request.urlopen(request).read()
+                    srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                     raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                 if not srt_xml:
                     raise Trouble(u'WARNING: unable to download video subtitles')
                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                     raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                 if not srt_xml:
                     raise Trouble(u'WARNING: unable to download video subtitles')
-                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
+                video_subtitles = self._closed_captions_xml_to_srt(srt_xml)
             except Trouble as trouble:
             except Trouble as trouble:
-                self._downloader.trouble(trouble[0])
+                self._downloader.trouble(str(trouble))
 
         if 'length_seconds' not in video_info:
             self._downloader.trouble(u'WARNING: unable to extract video duration')
 
         if 'length_seconds' not in video_info:
             self._downloader.trouble(u'WARNING: unable to extract video duration')
@@ -666,7 +666,8 @@ class DailymotionIE(InfoExtractor):
         request.add_header('Cookie', 'family_filter=off')
         try:
             self.report_download_webpage(video_id)
         request.add_header('Cookie', 'family_filter=off')
         try:
             self.report_download_webpage(video_id)
-            webpage = compat_urllib_request.urlopen(request).read()
+            webpage_bytes = compat_urllib_request.urlopen(request).read()
+            webpage = webpage_bytes.decode('utf-8')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
             return
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
             return
@@ -701,7 +702,7 @@ class DailymotionIE(InfoExtractor):
         if mobj is None:
             self._downloader.trouble(u'ERROR: unable to extract title')
             return
         if mobj is None:
             self._downloader.trouble(u'ERROR: unable to extract title')
             return
-        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
+        video_title = unescapeHTML(mobj.group('title'))
 
         video_uploader = None
         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 
         video_uploader = None
         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
@@ -721,105 +722,12 @@ class DailymotionIE(InfoExtractor):
             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 
         return [{
             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 
         return [{
-            'id':       video_id.decode('utf-8'),
-            'url':      video_url.decode('utf-8'),
-            'uploader': video_uploader.decode('utf-8'),
+            'id':       video_id,
+            'url':      video_url,
+            'uploader': video_uploader,
             'upload_date':  video_upload_date,
             'title':    video_title,
             'upload_date':  video_upload_date,
             'title':    video_title,
-            'ext':      video_extension.decode('utf-8'),
-        }]
-
-
-class GoogleIE(InfoExtractor):
-    """Information extractor for video.google.com."""
-
-    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
-    IE_NAME = u'video.google'
-
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
-    def report_download_webpage(self, video_id):
-        """Report webpage download."""
-        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
-
-    def report_extraction(self, video_id):
-        """Report information extraction."""
-        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
-
-    def _real_extract(self, url):
-        # Extract id from URL
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
-            return
-
-        video_id = mobj.group(1)
-
-        video_extension = 'mp4'
-
-        # Retrieve video webpage to extract further information
-        request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
-        try:
-            self.report_download_webpage(video_id)
-            webpage = compat_urllib_request.urlopen(request).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
-            return
-
-        # Extract URL, uploader, and title from webpage
-        self.report_extraction(video_id)
-        mobj = re.search(r"download_url:'([^']+)'", webpage)
-        if mobj is None:
-            video_extension = 'flv'
-            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
-        if mobj is None:
-            self._downloader.trouble(u'ERROR: unable to extract media URL')
-            return
-        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
-        mediaURL = mediaURL.replace('\\x3d', '\x3d')
-        mediaURL = mediaURL.replace('\\x26', '\x26')
-
-        video_url = mediaURL
-
-        mobj = re.search(r'<title>(.*)</title>', webpage)
-        if mobj is None:
-            self._downloader.trouble(u'ERROR: unable to extract title')
-            return
-        video_title = mobj.group(1).decode('utf-8')
-
-        # Extract video description
-        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
-        if mobj is None:
-            self._downloader.trouble(u'ERROR: unable to extract video description')
-            return
-        video_description = mobj.group(1).decode('utf-8')
-        if not video_description:
-            video_description = 'No description available.'
-
-        # Extract video thumbnail
-        if self._downloader.params.get('forcethumbnail', False):
-            request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
-            try:
-                webpage = compat_urllib_request.urlopen(request).read()
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
-                return
-            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
-            if mobj is None:
-                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
-                return
-            video_thumbnail = mobj.group(1)
-        else:   # we need something to pass to process_info
-            video_thumbnail = ''
-
-        return [{
-            'id':       video_id.decode('utf-8'),
-            'url':      video_url.decode('utf-8'),
-            'uploader': None,
-            'upload_date':  None,
-            'title':    video_title,
-            'ext':      video_extension.decode('utf-8'),
+            'ext':      video_extension,
         }]
 
 
         }]
 
 
@@ -891,6 +799,7 @@ class PhotobucketIE(InfoExtractor):
 class YahooIE(InfoExtractor):
     """Information extractor for video.yahoo.com."""
 
 class YahooIE(InfoExtractor):
     """Information extractor for video.yahoo.com."""
 
+    _WORKING = False
     # _VALID_URL matches all Yahoo! Video URLs
     # _VPAGE_URL matches only the extractable '/watch/' URLs
     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
     # _VALID_URL matches all Yahoo! Video URLs
     # _VPAGE_URL matches only the extractable '/watch/' URLs
     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
@@ -1061,7 +970,8 @@ class VimeoIE(InfoExtractor):
         request = compat_urllib_request.Request(url, None, std_headers)
         try:
             self.report_download_webpage(video_id)
         request = compat_urllib_request.Request(url, None, std_headers)
         try:
             self.report_download_webpage(video_id)
-            webpage = compat_urllib_request.urlopen(request).read()
+            webpage_bytes = compat_urllib_request.urlopen(request).read()
+            webpage = webpage_bytes.decode('utf-8')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
             return
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
             return
@@ -1078,7 +988,7 @@ class VimeoIE(InfoExtractor):
         except:
             self._downloader.trouble(u'ERROR: unable to extract info section')
             return
         except:
             self._downloader.trouble(u'ERROR: unable to extract info section')
             return
-        
+
         # Extract title
         video_title = config["video"]["title"]
 
         # Extract title
         video_title = config["video"]["title"]
 
@@ -1089,7 +999,7 @@ class VimeoIE(InfoExtractor):
         video_thumbnail = config["video"]["thumbnail"]
 
         # Extract video description
         video_thumbnail = config["video"]["thumbnail"]
 
         # Extract video description
-        video_description = get_element_by_id("description", webpage.decode('utf8'))
+        video_description = get_element_by_id("description", webpage)
         if video_description: video_description = clean_html(video_description)
         else: video_description = ''
 
         if video_description: video_description = clean_html(video_description)
         else: video_description = ''
 
@@ -1261,7 +1171,7 @@ class ArteTvIE(InfoExtractor):
             'url':          compat_urllib_parse.unquote(info.get('url')),
             'uploader':     u'arte.tv',
             'upload_date':  info.get('date'),
             'url':          compat_urllib_parse.unquote(info.get('url')),
             'uploader':     u'arte.tv',
             'upload_date':  info.get('date'),
-            'title':        info.get('title'),
+            'title':        info.get('title').decode('utf-8'),
             'ext':          u'mp4',
             'format':       u'NA',
             'player_url':   None,
             'ext':          u'mp4',
             'format':       u'NA',
             'player_url':   None,
@@ -1301,7 +1211,7 @@ class GenericIE(InfoExtractor):
     def report_following_redirect(self, new_url):
         """Report information extraction."""
         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
     def report_following_redirect(self, new_url):
         """Report information extraction."""
         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
-        
+
     def _test_redirect(self, url):
         """Check if it is a redirect, like url shorteners, in case restart chain."""
         class HeadRequest(compat_urllib_request.Request):
     def _test_redirect(self, url):
         """Check if it is a redirect, like url shorteners, in case restart chain."""
         class HeadRequest(compat_urllib_request.Request):
@@ -1310,38 +1220,38 @@ class GenericIE(InfoExtractor):
 
         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
             """
 
         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
             """
-            Subclass the HTTPRedirectHandler to make it use our 
+            Subclass the HTTPRedirectHandler to make it use our
             HeadRequest also on the redirected URL
             """
             HeadRequest also on the redirected URL
             """
-            def redirect_request(self, req, fp, code, msg, headers, newurl): 
+            def redirect_request(self, req, fp, code, msg, headers, newurl):
                 if code in (301, 302, 303, 307):
                 if code in (301, 302, 303, 307):
-                    newurl = newurl.replace(' ', '%20') 
+                    newurl = newurl.replace(' ', '%20')
                     newheaders = dict((k,v) for k,v in req.headers.items()
                                       if k.lower() not in ("content-length", "content-type"))
                     newheaders = dict((k,v) for k,v in req.headers.items()
                                       if k.lower() not in ("content-length", "content-type"))
-                    return HeadRequest(newurl, 
+                    return HeadRequest(newurl,
                                        headers=newheaders,
                                        headers=newheaders,
-                                       origin_req_host=req.get_origin_req_host(), 
-                                       unverifiable=True) 
-                else: 
-                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) 
+                                       origin_req_host=req.get_origin_req_host(),
+                                       unverifiable=True)
+                else:
+                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
 
         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
             """
             Fallback to GET if HEAD is not allowed (405 HTTP error)
             """
 
         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
             """
             Fallback to GET if HEAD is not allowed (405 HTTP error)
             """
-            def http_error_405(self, req, fp, code, msg, headers): 
+            def http_error_405(self, req, fp, code, msg, headers):
                 fp.read()
                 fp.close()
 
                 newheaders = dict((k,v) for k,v in req.headers.items()
                                   if k.lower() not in ("content-length", "content-type"))
                 fp.read()
                 fp.close()
 
                 newheaders = dict((k,v) for k,v in req.headers.items()
                                   if k.lower() not in ("content-length", "content-type"))
-                return self.parent.open(compat_urllib_request.Request(req.get_full_url(), 
-                                                 headers=newheaders, 
-                                                 origin_req_host=req.get_origin_req_host(), 
+                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
+                                                 headers=newheaders,
+                                                 origin_req_host=req.get_origin_req_host(),
                                                  unverifiable=True))
 
         # Build our opener
                                                  unverifiable=True))
 
         # Build our opener
-        opener = compat_urllib_request.OpenerDirector() 
+        opener = compat_urllib_request.OpenerDirector()
         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                         HTTPMethodFallback, HEADRedirectHandler,
                         compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                         HTTPMethodFallback, HEADRedirectHandler,
                         compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
@@ -1407,22 +1317,22 @@ class GenericIE(InfoExtractor):
         if mobj is None:
             self._downloader.trouble(u'ERROR: unable to extract title')
             return
         if mobj is None:
             self._downloader.trouble(u'ERROR: unable to extract title')
             return
-        video_title = mobj.group(1).decode('utf-8')
+        video_title = mobj.group(1)
 
         # video uploader is domain name
         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
         if mobj is None:
             self._downloader.trouble(u'ERROR: unable to extract title')
             return
 
         # video uploader is domain name
         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
         if mobj is None:
             self._downloader.trouble(u'ERROR: unable to extract title')
             return
-        video_uploader = mobj.group(1).decode('utf-8')
+        video_uploader = mobj.group(1)
 
         return [{
 
         return [{
-            'id':       video_id.decode('utf-8'),
-            'url':      video_url.decode('utf-8'),
+            'id':       video_id,
+            'url':      video_url,
             'uploader': video_uploader,
             'upload_date':  None,
             'title':    video_title,
             'uploader': video_uploader,
             'upload_date':  None,
             'title':    video_title,
-            'ext':      video_extension.decode('utf-8'),
+            'ext':      video_extension,
         }]
 
 
         }]
 
 
@@ -1586,6 +1496,8 @@ class GoogleSearchIE(InfoExtractor):
 
 class YahooSearchIE(InfoExtractor):
     """Information Extractor for Yahoo! Video search queries."""
 
 class YahooSearchIE(InfoExtractor):
     """Information Extractor for Yahoo! Video search queries."""
+
+    _WORKING = False
     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
@@ -1713,7 +1625,7 @@ class YoutubePlaylistIE(InfoExtractor):
             url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
             request = compat_urllib_request.Request(url)
             try:
             url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
             request = compat_urllib_request.Request(url)
             try:
-                page = compat_urllib_request.urlopen(request).read().decode('utf8')
+                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                 return
             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                 return
@@ -1842,7 +1754,7 @@ class YoutubeUserIE(InfoExtractor):
             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
 
             try:
             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
 
             try:
-                page = compat_urllib_request.urlopen(request).read()
+                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                 return
             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                 return
@@ -2344,7 +2256,7 @@ class MyVideoIE(InfoExtractor):
 
     def __init__(self, downloader=None):
         InfoExtractor.__init__(self, downloader)
 
     def __init__(self, downloader=None):
         InfoExtractor.__init__(self, downloader)
-    
+
     def report_download_webpage(self, video_id):
         """Report webpage download."""
         self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
     def report_download_webpage(self, video_id):
         """Report webpage download."""
         self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
@@ -2365,7 +2277,7 @@ class MyVideoIE(InfoExtractor):
         request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
         try:
             self.report_download_webpage(video_id)
         request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
         try:
             self.report_download_webpage(video_id)
-            webpage = compat_urllib_request.urlopen(request).read()
+            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
             return
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
             return
@@ -2398,10 +2310,10 @@ class ComedyCentralIE(InfoExtractor):
     """Information extractor for The Daily Show and Colbert Report """
 
     # urls can be abbreviations like :thedailyshow or :colbert
     """Information extractor for The Daily Show and Colbert Report """
 
     # urls can be abbreviations like :thedailyshow or :colbert
-    # urls for episodes like: 
+    # urls for episodes like:
     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
-    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524    
+    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                       |(https?://)?(www\.)?
                           (?P<showname>thedailyshow|colbertnation)\.com/
     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                       |(https?://)?(www\.)?
                           (?P<showname>thedailyshow|colbertnation)\.com/
@@ -2409,7 +2321,7 @@ class ComedyCentralIE(InfoExtractor):
                           (?P<clip>
                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                           (?P<clip>
                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
-                     $"""                        
+                     $"""
     IE_NAME = u'comedycentral'
 
     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
     IE_NAME = u'comedycentral'
 
     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
@@ -2513,7 +2425,7 @@ class ComedyCentralIE(InfoExtractor):
                 return
             else:
                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
                 return
             else:
                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
-        
+
         playerUrl_raw = mMovieParams[0][0]
         self.report_player_url(epTitle)
         try:
         playerUrl_raw = mMovieParams[0][0]
         self.report_player_url(epTitle)
         try:
@@ -2562,7 +2474,7 @@ class ComedyCentralIE(InfoExtractor):
             if len(turls) == 0:
                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                 continue
             if len(turls) == 0:
                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                 continue
-            
+
             if self._downloader.params.get('listformats', None):
                 self._print_formats([i[0] for i in turls])
                 return
             if self._downloader.params.get('listformats', None):
                 self._print_formats([i[0] for i in turls])
                 return
@@ -2602,7 +2514,7 @@ class ComedyCentralIE(InfoExtractor):
             }
 
             results.append(info)
             }
 
             results.append(info)
-            
+
         return results
 
 
         return results
 
 
@@ -2647,7 +2559,9 @@ class EscapistIE(InfoExtractor):
 
         self.report_config_download(showName)
         try:
 
         self.report_config_download(showName)
         try:
-            configJSON = compat_urllib_request.urlopen(configUrl).read()
+            configJSON = compat_urllib_request.urlopen(configUrl)
+            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
+            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
             return
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
             return
@@ -2770,13 +2684,14 @@ class XVideosIE(InfoExtractor):
         if mobj is None:
             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
             return
         if mobj is None:
             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
             return
-        video_id = mobj.group(1).decode('utf-8')
+        video_id = mobj.group(1)
 
         self.report_webpage(video_id)
 
         request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
         try:
 
         self.report_webpage(video_id)
 
         request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
         try:
-            webpage = compat_urllib_request.urlopen(request).read()
+            webpage_bytes = compat_urllib_request.urlopen(request).read()
+            webpage = webpage_bytes.decode('utf-8', 'replace')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
             return
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
             return
@@ -2789,7 +2704,7 @@ class XVideosIE(InfoExtractor):
         if mobj is None:
             self._downloader.trouble(u'ERROR: unable to extract video url')
             return
         if mobj is None:
             self._downloader.trouble(u'ERROR: unable to extract video url')
             return
-        video_url = compat_urllib_parse.unquote(mobj.group(1).decode('utf-8'))
+        video_url = compat_urllib_parse.unquote(mobj.group(1))
 
 
         # Extract title
 
 
         # Extract title
@@ -2797,7 +2712,7 @@ class XVideosIE(InfoExtractor):
         if mobj is None:
             self._downloader.trouble(u'ERROR: unable to extract video title')
             return
         if mobj is None:
             self._downloader.trouble(u'ERROR: unable to extract video title')
             return
-        video_title = mobj.group(1).decode('utf-8')
+        video_title = mobj.group(1)
 
 
         # Extract video thumbnail
 
 
         # Extract video thumbnail
@@ -2805,7 +2720,7 @@ class XVideosIE(InfoExtractor):
         if mobj is None:
             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
             return
         if mobj is None:
             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
             return
-        video_thumbnail = mobj.group(0).decode('utf-8')
+        video_thumbnail = mobj.group(0)
 
         info = {
             'id': video_id,
 
         info = {
             'id': video_id,
@@ -2966,6 +2881,8 @@ class InfoQIE(InfoExtractor):
 
 class MixcloudIE(InfoExtractor):
     """Information extractor for www.mixcloud.com"""
 
 class MixcloudIE(InfoExtractor):
     """Information extractor for www.mixcloud.com"""
+
+    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
     IE_NAME = u'mixcloud'
 
     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
     IE_NAME = u'mixcloud'
 
@@ -3161,7 +3078,7 @@ class StanfordOpenClassroomIE(InfoExtractor):
                 assert entry['type'] == 'reference'
                 results += self.extract(entry['url'])
             return results
                 assert entry['type'] == 'reference'
                 results += self.extract(entry['url'])
             return results
-            
+
         else: # Root page
             info = {
                 'id': 'Stanford OpenClassroom',
         else: # Root page
             info = {
                 'id': 'Stanford OpenClassroom',
@@ -3235,7 +3152,7 @@ class MTVIE(InfoExtractor):
             self._downloader.trouble(u'ERROR: unable to extract performer')
             return
         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
             self._downloader.trouble(u'ERROR: unable to extract performer')
             return
         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
-        video_title = performer + ' - ' + song_name 
+        video_title = performer + ' - ' + song_name
 
         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
         if mobj is None:
 
         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
         if mobj is None:
@@ -3348,7 +3265,8 @@ class YoukuIE(InfoExtractor):
 
         self.report_extraction(video_id)
         try:
 
         self.report_extraction(video_id)
         try:
-            config = json.loads(jsondata)
+            jsonstr = jsondata.decode('utf-8')
+            config = json.loads(jsonstr)
 
             video_title =  config['data'][0]['title']
             seed = config['data'][0]['seed']
 
             video_title =  config['data'][0]['title']
             seed = config['data'][0]['seed']
@@ -3371,15 +3289,8 @@ class YoukuIE(InfoExtractor):
 
 
             fileid = config['data'][0]['streamfileids'][format]
 
 
             fileid = config['data'][0]['streamfileids'][format]
-            seg_number = len(config['data'][0]['segs'][format])
-
-            keys=[]
-            for i in xrange(seg_number):
-                keys.append(config['data'][0]['segs'][format][i]['k'])
-
-            #TODO check error
-            #youku only could be viewed from mainland china
-        except:
+            keys = [s['k'] for s in config['data'][0]['segs'][format]]
+        except (UnicodeDecodeError, ValueError, KeyError):
             self._downloader.trouble(u'ERROR: unable to extract info section')
             return
 
             self._downloader.trouble(u'ERROR: unable to extract info section')
             return
 
@@ -3429,13 +3340,14 @@ class XNXXIE(InfoExtractor):
         if mobj is None:
             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
             return
         if mobj is None:
             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
             return
-        video_id = mobj.group(1).decode('utf-8')
+        video_id = mobj.group(1)
 
         self.report_webpage(video_id)
 
         # Get webpage content
         try:
 
         self.report_webpage(video_id)
 
         # Get webpage content
         try:
-            webpage = compat_urllib_request.urlopen(url).read()
+            webpage_bytes = compat_urllib_request.urlopen(url).read()
+            webpage = webpage_bytes.decode('utf-8')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
             return
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
             return
@@ -3444,19 +3356,19 @@ class XNXXIE(InfoExtractor):
         if result is None:
             self._downloader.trouble(u'ERROR: unable to extract video url')
             return
         if result is None:
             self._downloader.trouble(u'ERROR: unable to extract video url')
             return
-        video_url = compat_urllib_parse.unquote(result.group(1).decode('utf-8'))
+        video_url = compat_urllib_parse.unquote(result.group(1))
 
         result = re.search(self.VIDEO_TITLE_RE, webpage)
         if result is None:
             self._downloader.trouble(u'ERROR: unable to extract video title')
             return
 
         result = re.search(self.VIDEO_TITLE_RE, webpage)
         if result is None:
             self._downloader.trouble(u'ERROR: unable to extract video title')
             return
-        video_title = result.group(1).decode('utf-8')
+        video_title = result.group(1)
 
         result = re.search(self.VIDEO_THUMB_RE, webpage)
         if result is None:
             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
             return
 
         result = re.search(self.VIDEO_THUMB_RE, webpage)
         if result is None:
             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
             return
-        video_thumbnail = result.group(1).decode('utf-8')
+        video_thumbnail = result.group(1)
 
         return [{
             'id': video_id,
 
         return [{
             'id': video_id,
@@ -3473,7 +3385,7 @@ class XNXXIE(InfoExtractor):
 class GooglePlusIE(InfoExtractor):
     """Information extractor for plus.google.com."""
 
 class GooglePlusIE(InfoExtractor):
     """Information extractor for plus.google.com."""
 
-    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
+    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
     IE_NAME = u'plus.google'
 
     def __init__(self, downloader=None):
     IE_NAME = u'plus.google'
 
     def __init__(self, downloader=None):
@@ -3481,7 +3393,7 @@ class GooglePlusIE(InfoExtractor):
 
     def report_extract_entry(self, url):
         """Report downloading extry"""
 
     def report_extract_entry(self, url):
         """Report downloading extry"""
-        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
+        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
 
     def report_date(self, upload_date):
         """Report downloading extry"""
 
     def report_date(self, upload_date):
         """Report downloading extry"""
@@ -3489,15 +3401,15 @@ class GooglePlusIE(InfoExtractor):
 
     def report_uploader(self, uploader):
         """Report downloading extry"""
 
     def report_uploader(self, uploader):
         """Report downloading extry"""
-        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
+        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
 
     def report_title(self, video_title):
         """Report downloading extry"""
 
     def report_title(self, video_title):
         """Report downloading extry"""
-        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
+        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
 
     def report_extract_vid_page(self, video_page):
         """Report information extraction."""
 
     def report_extract_vid_page(self, video_page):
         """Report information extraction."""
-        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
+        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
 
     def _real_extract(self, url):
         # Extract id from URL
 
     def _real_extract(self, url):
         # Extract id from URL
@@ -3507,7 +3419,7 @@ class GooglePlusIE(InfoExtractor):
             return
 
         post_url = mobj.group(0)
             return
 
         post_url = mobj.group(0)
-        video_id = mobj.group(2)
+        video_id = mobj.group(1)
 
         video_extension = 'flv'
 
 
         video_extension = 'flv'
 
@@ -3515,7 +3427,7 @@ class GooglePlusIE(InfoExtractor):
         self.report_extract_entry(post_url)
         request = compat_urllib_request.Request(post_url)
         try:
         self.report_extract_entry(post_url)
         request = compat_urllib_request.Request(post_url)
         try:
-            webpage = compat_urllib_request.urlopen(request).read()
+            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
             return
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
             return
@@ -3557,7 +3469,7 @@ class GooglePlusIE(InfoExtractor):
         video_page = mobj.group(1)
         request = compat_urllib_request.Request(video_page)
         try:
         video_page = mobj.group(1)
         request = compat_urllib_request.Request(video_page)
         try:
-            webpage = compat_urllib_request.urlopen(request).read()
+            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
             return
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
             return
@@ -3579,14 +3491,142 @@ class GooglePlusIE(InfoExtractor):
         # Only get the url. The resolution part in the tuple has no use anymore
         video_url = video_url[-1]
         # Treat escaped \u0026 style hex
         # Only get the url. The resolution part in the tuple has no use anymore
         video_url = video_url[-1]
         # Treat escaped \u0026 style hex
-        video_url = unicode(video_url, "unicode_escape")
+        try:
+            video_url = video_url.decode("unicode_escape")
+        except AttributeError: # Python 3
+            video_url = bytes(video_url, 'ascii').decode('unicode-escape')
 
 
         return [{
 
 
         return [{
-            'id':       video_id.decode('utf-8'),
+            'id':       video_id,
             'url':      video_url,
             'url':      video_url,
-            'uploader': uploader.decode('utf-8'),
-            'upload_date':  upload_date.decode('utf-8'),
-            'title':    video_title.decode('utf-8'),
-            'ext':      video_extension.decode('utf-8'),
+            'uploader': uploader,
+            'upload_date':  upload_date,
+            'title':    video_title,
+            'ext':      video_extension,
         }]
         }]
+
+class NBAIE(InfoExtractor):
+    """Information extractor for video pages on nba.com."""
+
+    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
+    IE_NAME = u'nba'
+
+    def report_extraction(self, video_id):
+        """Report information extraction."""
+        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
+        self._downloader.to_screen(message)
+
+    def _real_extract(self, url):
+        """Build the info dictionary for the nba.com video page at url.
+
+        Returns a one-element list holding the info dictionary, or None
+        (after reporting trouble) when the URL does not match _VALID_URL
+        or the page cannot be downloaded.
+        """
+        url_match = re.match(self._VALID_URL, url)
+        if url_match is None:
+            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+            return
+
+        # The path captured from the URL serves as the video id; a trailing
+        # index page is stripped from it.
+        index_suffix = '/index.html'
+        video_id = url_match.group(1)
+        if video_id.endswith(index_suffix):
+            video_id = video_id[:-len(index_suffix)]
+
+        self.report_extraction(video_id)
+        try:
+            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8', 'ignore')
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
+            return
+
+        def _findProp(rexp, default=None):
+            # First capture group of rexp in the page, HTML-unescaped;
+            # default when the pattern does not occur.
+            found = re.search(rexp, webpage)
+            if not found:
+                return default
+            return unescapeHTML(found.group(1))
+
+        # Last path component; also the fallback title when the page has
+        # no og:title meta tag.
+        shortened_video_id = video_id.rpartition('/')[2]
+        page_title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id)
+        return [{
+            'id': shortened_video_id,
+            'url': u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4',
+            'ext': 'mp4',
+            'title': page_title.replace('NBA.com: ', ''),
+            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
+            'description': _findProp(r'<div class="description">(.*?)</h1>'),
+        }]
+
+class JustinTVIE(InfoExtractor):
+    """Information extractor for justin.tv and twitch.tv"""
+    # TODO: One broadcast may be split into multiple videos. The key
+    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
+    # starts at 1 and increases. Can we treat all parts as one video?
+
+    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
+        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
+    _JUSTIN_PAGE_LIMIT = 100
+    IE_NAME = u'justin.tv'
+
+    def report_extraction(self, file_id):
+        """Report information extraction."""
+        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
+
+    def report_download_page(self, channel, offset):
+        """Report attempt to download a single page of videos."""
+        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
+                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
+
+    def _parse_page(self, url):
+        """Download one API page and turn its clips into info dictionaries.
+
+        Returns a tuple (count, items): count is the number of entries in
+        the JSON response and items holds an info dictionary for every
+        entry that has a non-empty 'video_file_url'.
+
+        On any failure (download error, invalid JSON, response that is not
+        a list) trouble is reported and (0, []) is returned, so callers can
+        always unpack the result and pagination stops.
+        """
+        try:
+            urlh = compat_urllib_request.urlopen(url)
+            webpage_bytes = urlh.read()
+            webpage = webpage_bytes.decode('utf-8', 'ignore')
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
+            return (0, [])
+
+        try:
+            response = json.loads(webpage)
+        except ValueError as err:
+            self._downloader.trouble(u'ERROR: unable to parse video info JSON: %s' % compat_str(err))
+            return (0, [])
+        # The loop below indexes each entry by key, so anything other than
+        # a list of clip dictionaries cannot be processed.
+        if not isinstance(response, list):
+            self._downloader.trouble(u'ERROR: unexpected video info JSON: expected a list of clips')
+            return (0, [])
+
+        info = []
+        for clip in response:
+            video_url = clip['video_file_url']
+            if video_url:
+                video_extension = os.path.splitext(video_url)[1][1:]
+                # First ten characters of 'created_on' with the dashes
+                # removed.
+                video_date = clip['created_on'][:10].replace('-', '')
+                info.append({
+                    'id': clip['id'],
+                    'url': video_url,
+                    'title': clip['title'],
+                    'uploader': clip.get('user_id', clip.get('channel_id')),
+                    'upload_date': video_date,
+                    'ext': video_extension,
+                })
+        return (len(response), info)
+
+    def _real_extract(self, url):
+        """Collect info dictionaries for a channel archive or a single clip.
+
+        A URL with only a channel name is treated as a paged archive and
+        fetched _JUSTIN_PAGE_LIMIT entries at a time until a page comes
+        back with a different entry count; a URL with a '/b/<id>' part
+        is fetched with a single request. Returns the list of info
+        dictionaries, or None after reporting trouble for an invalid URL.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+            return
+
+        api = 'http://api.justin.tv'
+        # lastindex is 1 when only the channel group matched, 2 when the
+        # optional broadcast id group matched as well.
+        video_id = mobj.group(mobj.lastindex)
+        paged = False
+        if mobj.lastindex == 1:
+            paged = True
+            api += '/channel/archives/%s.json'
+        else:
+            api += '/clip/show/%s.json'
+        api = api % (video_id,)
+
+        self.report_extraction(video_id)
+
+        info = []
+        offset = 0
+        limit = self._JUSTIN_PAGE_LIMIT
+        while True:
+            if paged:
+                self.report_download_page(video_id, offset)
+            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
+            page_count, page_info = self._parse_page(page_url)
+            info.extend(page_info)
+            # A page with a count other than the limit is the last one;
+            # this also covers the (0, []) returned on failure.
+            if not paged or page_count != limit:
+                break
+            offset += limit
+        return info