video_info['title'] = playlist_title
return video_info
-    def _search_regex(self, pattern, text, name, fatal=True, flags=0):
-        """Extract a field from some text based on regex"""
-        mobj = re.search(pattern, text, flags)
-        if mobj is None and fatal:
-            raise ExtractorError(u'Unable to extract %s; '
-                u'please report this issue on GitHub.' % name)
-        elif mobj is None:
+    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+        """
+        Perform a regex search on the given string, using a single pattern or
+        a sequence of patterns, and return the first matching group.
+
+        pattern -- a str, compat_str or compiled regex; anything else is
+                   iterated and each item is tried in order until one matches.
+        string  -- the text to search.
+        name    -- human-readable field name used in the error/warning text.
+        default -- value returned when nothing matches; only honoured when it
+                   is not None, and it takes precedence over fatal.
+        fatal   -- when nothing matches and no default is given, raise an
+                   ExtractorError if true, otherwise report a warning through
+                   the downloader and return None.
+        flags   -- passed straight through to re.search.
+
+        On a match, the first group that is not None is returned, so patterns
+        built from alternatives such as (?:"(a)"|'(b)') work as expected.
+        """
+        # Start from "no match" so that an empty pattern sequence falls through
+        # to the default/fatal handling instead of leaving mobj unbound.
+        mobj = None
+        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
+            mobj = re.search(pattern, string, flags)
+        else:
+            for p in pattern:
+                mobj = re.search(p, string, flags)
+                if mobj:
+                    break
+
+        # Wrap the field name in an ANSI colour escape sequence, but only when
+        # stderr is a terminal and the platform is not Windows ('nt').
+        if sys.stderr.isatty() and os.name != 'nt':
+            _name = u'\033[0;34m%s\033[0m' % name
+        else:
+            _name = name
+
+        if mobj:
+            # return the first matching group
+            return next(g for g in mobj.groups() if g is not None)
+        elif default is not None:
+            return default
+        elif fatal:
+            raise ExtractorError(u'Unable to extract %s' % _name)
+        else:
             self._downloader.report_warning(u'unable to extract %s; '
-                u'please report this issue on GitHub.' % name)
+                u'please report this issue on GitHub.' % _name)
             return None
+
+    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+        """
+        Like _search_regex, but pass a truthy result through clean_html() and
+        strip surrounding whitespace before returning it.
+
+        All arguments are forwarded unchanged to _search_regex. A falsy result
+        (None, or an empty match) is returned as-is without any cleaning.
+        """
+        res = self._search_regex(pattern, string, name, default, fatal, flags)
+        if res:
+            return clean_html(res).strip()
         else:
-            # return the first matched group
-            return next(g for g in mobj.groups() if g is not None)
+            return res
class SearchInfoExtractor(InfoExtractor):
"""
video_duration = int(video_data['video_duration'])
thumbnail = video_data['thumbnail_src']
- video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
+ video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
webpage, u'title')
- video_title = unescapeHTML(video_title)
info = {
'id': video_id,
self.report_extraction(video_id)
video_url = mobj.group(1) + '.flv'
- video_title = self._search_regex('<title>([^<]+)</title>',
+ video_title = self._html_search_regex('<title>([^<]+)</title>',
webpage, u'title')
video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
video_swfobj = compat_urllib_parse.unquote(video_swfobj)
- video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
+ video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
webpage, u'title')
return [{
showName = mobj.group('showname')
videoId = mobj.group('episode')
- self.report_extraction(showName)
- webpage = self._download_webpage(url, showName)
+ self.report_extraction(videoId)
+ webpage = self._download_webpage(url, videoId)
- videoDesc = self._search_regex('<meta name="description" content="([^"]*)"',
+ videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
webpage, u'description', fatal=False)
- if videoDesc: videoDesc = unescapeHTML(videoDesc)
- imgUrl = self._search_regex('<meta property="og:image" content="([^"]*)"',
+ imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
webpage, u'thumbnail', fatal=False)
- if imgUrl: imgUrl = unescapeHTML(imgUrl)
- playerUrl = self._search_regex('<meta property="og:video" content="([^"]*)"',
+ playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
webpage, u'player url')
- playerUrl = unescapeHTML(playerUrl)
+
+ title = self._html_search_regex('<meta name="title" content="([^"]*)"',
+ webpage, u'player url').split(' : ')[-1]
configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
configUrl = compat_urllib_parse.unquote(configUrl)
- configJSON = self._download_webpage(configUrl, showName,
+ configJSON = self._download_webpage(configUrl, videoId,
u'Downloading configuration',
u'unable to download configuration')
'url': videoUrl,
'uploader': showName,
'upload_date': None,
- 'title': showName,
+ 'title': title,
'ext': 'mp4',
'thumbnail': imgUrl,
'description': videoDesc,
webpage, u'video URL'))
# Extract title
- video_title = self._search_regex(r'<title>(.*?)\s+-\s+XVID',
+ video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
webpage, u'title')
# Extract video thumbnail
webpage, u'title')
# Extract description
- video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
+ video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
webpage, u'description', fatal=False)
video_filename = video_url.split('/')[-1]
note='Downloading course info page',
errnote='Unable to download course info page')
- # TODO: implement default_value in search_regex
- m = re.search('<h1>([^<]+)</h1>', coursepage)
- if m:
- info['title'] = unescapeHTML(m.group(1))
- else:
- info['title'] = info['id']
+ info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
- info['description'] = self._search_regex('<description>([^<]+)</description>',
+ info['description'] = self._html_search_regex('<description>([^<]+)</description>',
coursepage, u'description', fatal=False)
- if info['description']: info['description'] = unescapeHTML(info['description'])
links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
info['list'] = [
webpage = self._download_webpage(url, video_id)
- song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
+ song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
webpage, u'song name', fatal=False)
- if song_name: song_name = unescapeHTML(song_name)
- video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
+ video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
webpage, u'title')
- video_title = unescapeHTML(video_title)
- mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
+ mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
webpage, u'mtvn_uri', fatal=False)
content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
webpage, u'video URL')
video_url = compat_urllib_parse.unquote(video_url)
- video_title = self._search_regex(self.VIDEO_TITLE_RE,
+ video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
webpage, u'title')
video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
self.report_extraction(video_id)
# Extract update date
- upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
+ upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
webpage, u'upload date', fatal=False)
if upload_date:
# Convert timestring to a format suitable for filename
upload_date = upload_date.strftime('%Y%m%d')
# Extract uploader
- uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
+ uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
webpage, u'uploader', fatal=False)
# Extract title
# Get the first line for title
- # TODO: implement default_value in search_regex
- video_title = u'NA'
- pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
- mobj = re.search(pattern, webpage)
- if mobj:
- video_title = mobj.group(1)
+ video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
+ webpage, 'title', default=u'NA')
# Step 2, Stimulate clicking the image box to launch video
video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
}]
class NBAIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
+ _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
IE_NAME = u'nba'
def _real_extract(self, url):
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group(1)
- if video_id.endswith('/index.html'):
- video_id = video_id[:-len('/index.html')]
webpage = self._download_webpage(url, video_id)
video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
- # TODO: implement default_value in search_regex
- def _findProp(rexp, default=None):
- m = re.search(rexp, webpage)
- if m:
- return unescapeHTML(m.group(1))
- else:
- return default
-
shortened_video_id = video_id.rpartition('/')[2]
- title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
+ title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
+ webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
+
+ # The upload date is not present in the HTML that the server returns to us
+ # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
+
+ description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
+
info = {
'id': shortened_video_id,
'url': video_url,
'ext': 'mp4',
'title': title,
- 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
- 'description': _findProp(r'<div class="description">(.*?)</h1>'),
+ # 'uploader_date': uploader_date,
+ 'description': description,
}
return [info]
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
+ video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
webpage, u'video URL', flags=re.DOTALL)
- video_url = unescapeHTML(video_url)
- # TODO: implement fallbacks in regex_search
- m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
- if not m:
- m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
- if not m:
- raise ExtractorError(u'Cannot find video title')
- title = clean_html(m.group('title'))
+ title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
+ r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
- video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
- webpage, u'description', flags=re.DOTALL)
- if video_description: video_description = unescapeHTML(video_description)
+ video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+ webpage, u'description', fatal=False, flags=re.DOTALL)
info = {
'id': video_id,
self.report_extraction(video_id)
- video_title = self._search_regex(r'data-title="(?P<title>.+)"',
+ video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
webpage, u'title')
- uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
+ uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
webpage, u'uploader', fatal=False, flags=re.DOTALL)
- if uploader: uploader = unescapeHTML(uploader.strip())
- thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
+ thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
webpage, u'thumbnail', fatal=False)
info = {
else:
ext = 'flv'
- video_title = self._search_regex(r"<title>(.*)</title>",
+ video_title = self._html_search_regex(r"<title>(.*)</title>",
webpage_src, u'title')
# Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
- thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
+ thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
webpage_src, u'thumbnail', fatal=False)
if not thumbnail:
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
- # Get the video title
- video_title = self._search_regex(r'<h1.*?>(?P<title>.*)</h1>',
- webpage, u'title').strip()
-
- # Get the video date
- upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>',
- webpage, u'upload date', fatal=False)
- if upload_date: upload_date = unified_strdate(upload_date.strip())
+ # Get JSON parameters
+ json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
+ try:
+ params = json.loads(json_params)
+ except:
+ raise ExtractorError(u'Invalid JSON')
- # Get the video uploader
- video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>',
- webpage, u'uploader', fatal=False)
- if video_uploader: video_uploader = clean_html(video_uploader.strip())
+ self.report_extraction(video_id)
+ try:
+ video_title = params['title']
+ upload_date = unified_strdate(params['release_date_f'])
+ video_description = params['description']
+ video_uploader = params['submitted_by']
+ thumbnail = params['thumbnails'][0]['image']
+ except KeyError:
+ raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
# Get all of the formats available
DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
size = format[0]
bitrate = format[1]
format = "-".join( format )
- title = u'%s-%s-%s' % (video_title, size, bitrate)
+ # title = u'%s-%s-%s' % (video_title, size, bitrate)
formats.append({
'id': video_id,
'url': video_url,
'uploader': video_uploader,
'upload_date': upload_date,
- 'title': title,
+ 'title': video_title,
'ext': extension,
'format': format,
- 'thumbnail': None,
- 'description': None,
- 'player_url': None
+ 'thumbnail': thumbnail,
+ 'description': video_description
})
if self._downloader.params.get('listformats', None):
#Get the uploaded date
VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
- upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
+ upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
if upload_date: upload_date = unified_strdate(upload_date)
info = {'id': video_id,
webpage = self._download_webpage(url, video_id)
# Get the video title
- video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
+ video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
webpage, u'title').strip()
# Get the embed page
thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
webpage = self._download_webpage(url, video_id)
- video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+ video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
webpage, u'title')
- video_title = unescapeHTML(video_title)
- uploader = self._search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
+ uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
webpage, u'uploader', fatal=False)
- if uploader: uploader = clean_html(uploader)
info = {
'id': video_id,
webpage = self._download_webpage(url, video_id)
- video_title = self._search_regex(r'<div class="module-title">(.*?)</div>',
+ video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
webpage, u'title')
- video_title = unescapeHTML(video_title)
xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
xml_code = self._download_webpage(xml_url, video_id,
video_url = self._search_regex(r'file: "(.*?)",',
webpage, u'video URL')
- video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
- webpage, u'title')
- video_title = unescapeHTML(video_title).replace('LiveLeak.com -', '').strip()
+ video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+ webpage, u'title').replace('LiveLeak.com -', '').strip()
- video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+ video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
webpage, u'description', fatal=False)
- if video_description: video_description = unescapeHTML(video_description)
- video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
+ video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
webpage, u'uploader', fatal=False)
info = {
# The only place where you can get a title, it's not complete,
# but searching in other places doesn't work for all videos
- video_title = self._search_regex(r'<title>(?P<title>.*?)</title>',
+ video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
webpage, u'title', flags=re.DOTALL)
- video_title = unescapeHTML(video_title)
return [{'id': video_id,
'url': video_url,
self.report_extraction(video_id)
- video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
+ video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
webpage, u'video URL')
- video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
+ video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
webpage, u'title')
return [{
self.report_extraction(video_id)
- video_url = self._search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
+ video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
webpage, u'video URL')
video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
webpage, u'video URL')
- video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
+ video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
webpage, u'title')
- video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
+ video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
webpage, u'description', fatal=False)
- thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
+ thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
webpage, u'thumbnail', fatal=False)
return [{
self.report_extraction(video_id)
- video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
+ video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
webpage, u'video URL')
- video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
+ video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
webpage, u'title')
- thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
+ thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
webpage, u'thumbnail', fatal=False)
- uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
+ uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
webpage, u'uploader', fatal=False, flags=re.DOTALL)
return [{
first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
- node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>',
+ node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
first_xml, u'node_id')
second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
raise ExtractorError(u'Unable to extract video url')
video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
- video_title = self._search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
+ video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'video title')
- video_description = self._search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
+ video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'description', fatal=False)
- thumbnail = self._search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
+ thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'thumbnail', fatal=False)
return [{
url_title = mobj.group('url_title')
webpage = self._download_webpage(url, url_title)
- video_id = self._search_regex(r'<article class="video" data-id="(\d+?)"',
+ video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
webpage, u'video id')
self.report_extraction(video_id)
- video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
+ video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
webpage, u'title')
- thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)"',
+ thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
webpage, u'thumbnail', fatal=False)
- video_description = self._search_regex(r'<meta property="og:description" content="(.*?)"',
+ video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
webpage, u'description', fatal=False)
data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
- video_url = self._search_regex(r'<file type="high".*?>(.*?)</file>',
+ video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
data, u'video URL')
return [{
'thumbnail': thumbnail,
'description': video_description,
}]
-
+
class XHamsterIE(InfoExtractor):
"""Information Extractor for xHamster"""
_VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- mrss_url='http://xhamster.com/movies/%s/.html' % video_id
+ mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
webpage = self._download_webpage(mrss_url, video_id)
+
mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
if mobj is None:
raise ExtractorError(u'Unable to extract media URL')
video_url = mobj.group('server')+'/key='+mobj.group('file')
video_extension = video_url.split('.')[-1]
- mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = unescapeHTML(mobj.group('title'))
+ video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
+ webpage, u'title')
- mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
- if mobj is None:
- video_description = u''
- else:
- video_description = unescapeHTML(mobj.group('description'))
+ # Can't see the description anywhere in the UI
+ # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
+ # webpage, u'description', fatal=False)
+ # if video_description: video_description = unescapeHTML(video_description)
mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract upload date')
- video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
-
- mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
- if mobj is None:
- video_uploader_id = u'anonymous'
+ if mobj:
+ video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
else:
- video_uploader_id = mobj.group('uploader_id')
+ video_upload_date = None
+ self._downloader.report_warning(u'Unable to extract upload date')
- mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract thumbnail URL')
- video_thumbnail = mobj.group('thumbnail')
+ video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
+ webpage, u'uploader id', default=u'anonymous')
+
+ video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
+ webpage, u'thumbnail', fatal=False)
return [{
'id': video_id,
'url': video_url,
'ext': video_extension,
'title': video_title,
- 'description': video_description,
+ # 'description': video_description,
'upload_date': video_upload_date,
'uploader_id': video_uploader_id,
'thumbnail': video_thumbnail
cookie = urlh.headers.get('Set-Cookie', '')
self.report_extraction(track_id)
- mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
- if mobj is None:
- raise ExtractorError(u'Unable to extrack tracks')
- html_tracks = mobj.group(1).strip()
+
+ html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
+ response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
try:
track_list = json.loads(html_tracks)
track = track_list[u'tracks'][0]