X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=672ef9eedb40b1f8aa7db86e0a20b591c88511f3;hb=2491f5898ecdb0828a4619a983239657b190c4fa;hp=8fcd19ca2146ec648df075d8eb7565728dc4963b;hpb=cbf46c737c3f4156dee019b70521dcd3194877ac;p=youtube-dl.git
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 8fcd19ca2..672ef9eed 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -1,1800 +1,4 @@
-import base64
-import datetime
-import itertools
-import netrc
-import os
-import re
-import socket
-import time
-import email.utils
-import xml.etree.ElementTree
-import random
-import math
-import operator
-import hashlib
-import binascii
-import urllib
+# Legacy file for backwards compatibility, use youtube_dl.extractor instead!
-from .utils import *
from .extractor.common import InfoExtractor, SearchInfoExtractor
-
-from .extractor.ard import ARDIE
-from .extractor.arte import ArteTvIE
-from .extractor.bliptv import BlipTVIE, BlipTVUserIE
-from .extractor.comedycentral import ComedyCentralIE
-from .extractor.collegehumor import CollegeHumorIE
-from .extractor.dailymotion import DailymotionIE
-from .extractor.depositfiles import DepositFilesIE
-from .extractor.escapist import EscapistIE
-from .extractor.facebook import FacebookIE
-from .extractor.gametrailers import GametrailersIE
-from .extractor.generic import GenericIE
-from .extractor.googleplus import GooglePlusIE
-from .extractor.googlesearch import GoogleSearchIE
-from .extractor.metacafe import MetacafeIE
-from .extractor.myvideo import MyVideoIE
-from .extractor.statigram import StatigramIE
-from .extractor.photobucket import PhotobucketIE
-from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
-from .extractor.vimeo import VimeoIE
-from .extractor.xvideos import XVideosIE
-from .extractor.yahoo import YahooIE, YahooSearchIE
-from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
-from .extractor.zdf import ZDFIE
-
-
-
-class InfoQIE(InfoExtractor):
- """Information extractor for infoq.com"""
- _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
-
- webpage = self._download_webpage(url, video_id=url)
- self.report_extraction(url)
-
- # Extract video URL
- mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract video url')
- real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
- video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
-
- # Extract title
- video_title = self._search_regex(r'contentTitle = "(.*?)";',
- webpage, u'title')
-
- # Extract description
- video_description = self._html_search_regex(r'',
- webpage, u'description', fatal=False)
-
- video_filename = video_url.split('/')[-1]
- video_id, extension = video_filename.split('.')
-
- info = {
- 'id': video_id,
- 'url': video_url,
- 'uploader': None,
- 'upload_date': None,
- 'title': video_title,
- 'ext': extension, # Extension is always(?) mp4, but seems to be flv
- 'thumbnail': None,
- 'description': video_description,
- }
-
- return [info]
-
-class MixcloudIE(InfoExtractor):
- """Information extractor for www.mixcloud.com"""
-
- _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
- _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
- IE_NAME = u'mixcloud'
-
- def report_download_json(self, file_id):
- """Report JSON download."""
- self.to_screen(u'Downloading json')
-
- def get_urls(self, jsonData, fmt, bitrate='best'):
- """Get urls from 'audio_formats' section in json"""
- file_url = None
- try:
- bitrate_list = jsonData[fmt]
- if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
- bitrate = max(bitrate_list) # select highest
-
- url_list = jsonData[fmt][bitrate]
- except TypeError: # we have no bitrate info.
- url_list = jsonData[fmt]
- return url_list
-
- def check_urls(self, url_list):
- """Returns 1st active url from list"""
- for url in url_list:
- try:
- compat_urllib_request.urlopen(url)
- return url
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- url = None
-
- return None
-
- def _print_formats(self, formats):
- print('Available formats:')
- for fmt in formats.keys():
- for b in formats[fmt]:
- try:
- ext = formats[fmt][b][0]
- print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
- except TypeError: # we have no bitrate info
- ext = formats[fmt][0]
- print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
- break
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
- # extract uploader & filename from url
- uploader = mobj.group(1).decode('utf-8')
- file_id = uploader + "-" + mobj.group(2).decode('utf-8')
-
- # construct API request
- file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
- # retrieve .json file with links to files
- request = compat_urllib_request.Request(file_url)
- try:
- self.report_download_json(file_url)
- jsonData = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
-
- # parse JSON
- json_data = json.loads(jsonData)
- player_url = json_data['player_swf_url']
- formats = dict(json_data['audio_formats'])
-
- req_format = self._downloader.params.get('format', None)
- bitrate = None
-
- if self._downloader.params.get('listformats', None):
- self._print_formats(formats)
- return
-
- if req_format is None or req_format == 'best':
- for format_param in formats.keys():
- url_list = self.get_urls(formats, format_param)
- # check urls
- file_url = self.check_urls(url_list)
- if file_url is not None:
- break # got it!
- else:
- if req_format not in formats:
- raise ExtractorError(u'Format is not available')
-
- url_list = self.get_urls(formats, req_format)
- file_url = self.check_urls(url_list)
- format_param = req_format
-
- return [{
- 'id': file_id.decode('utf-8'),
- 'url': file_url.decode('utf-8'),
- 'uploader': uploader.decode('utf-8'),
- 'upload_date': None,
- 'title': json_data['name'],
- 'ext': file_url.split('.')[-1].decode('utf-8'),
- 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
- 'thumbnail': json_data['thumbnail_url'],
- 'description': json_data['description'],
- 'player_url': player_url.decode('utf-8'),
- }]
-
-class StanfordOpenClassroomIE(InfoExtractor):
- """Information extractor for Stanford's Open ClassRoom"""
-
- _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P[^&]+)(&video=(?P