youtube_dl/extractor/pornhub.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import functools
   5 import itertools
   6 import operator
   7 import re
   8
   9 from .common import InfoExtractor
  10 from ..compat import (
  11     compat_HTTPError,
  12     compat_str,
  13 )
  14 from ..utils import (
  15     ExtractorError,
  16     int_or_none,
  17     js_to_json,
  18     orderedSet,
  19     remove_quotes,
  20     str_to_int,
  21     url_or_none,
  22 )
  23
  24
  25 class PornHubIE(InfoExtractor):
  26     IE_DESC = 'PornHub and Thumbzilla'
  27     _VALID_URL = r'''(?x)
  28                     https?://
  29                         (?:
  30                             (?:[^/]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
  31                             (?:www\.)?thumbzilla\.com/video/
  32                         )
  33                         (?P<id>[\da-z]+)
  34                     '''
  35     _TESTS = [{
  36         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
  37         'md5': '1e19b41231a02eba417839222ac9d58e',
  38         'info_dict': {
  39             'id': '648719015',
  40             'ext': 'mp4',
  41             'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
  42             'uploader': 'Babes',
  43             'upload_date': '20130628',
  44             'duration': 361,
  45             'view_count': int,
  46             'like_count': int,
  47             'dislike_count': int,
  48             'comment_count': int,
  49             'age_limit': 18,
  50             'tags': list,
  51             'categories': list,
  52         },
  53     }, {
  54         # non-ASCII title
  55         'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
  56         'info_dict': {
  57             'id': '1331683002',
  58             'ext': 'mp4',
  59             'title': '重庆婷婷女王足交',
  60             'uploader': 'Unknown',
  61             'upload_date': '20150213',
  62             'duration': 1753,
  63             'view_count': int,
  64             'like_count': int,
  65             'dislike_count': int,
  66             'comment_count': int,
  67             'age_limit': 18,
  68             'tags': list,
  69             'categories': list,
  70         },
  71         'params': {
  72             'skip_download': True,
  73         },
  74     }, {
  75         # subtitles
  76         'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
  77         'info_dict': {
  78             'id': 'ph5af5fef7c2aa7',
  79             'ext': 'mp4',
  80             'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
  81             'uploader': 'BFFs',
  82             'duration': 622,
  83             'view_count': int,
  84             'like_count': int,
  85             'dislike_count': int,
  86             'comment_count': int,
  87             'age_limit': 18,
  88             'tags': list,
  89             'categories': list,
  90             'subtitles': {
  91                 'en': [{
  92                     "ext": 'srt'
  93                 }]
  94             },
  95         },
  96         'params': {
  97             'skip_download': True,
  98         },
  99     }, {
 100         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
 101         'only_matching': True,
 102     }, {
 103         # removed at the request of cam4.com
 104         'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
 105         'only_matching': True,
 106     }, {
 107         # removed at the request of the copyright owner
 108         'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
 109         'only_matching': True,
 110     }, {
 111         # removed by uploader
 112         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
 113         'only_matching': True,
 114     }, {
 115         # private video
 116         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
 117         'only_matching': True,
 118     }, {
 119         'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
 120         'only_matching': True,
 121     }, {
 122         'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
 123         'only_matching': True,
 124     }]
 125
 126     @staticmethod
 127     def _extract_urls(webpage):
 128         return re.findall(
 129             r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)',
 130             webpage)
 131
 132     def _extract_count(self, pattern, webpage, name):
 133         return str_to_int(self._search_regex(
 134             pattern, webpage, '%s count' % name, fatal=False))
 135
 136     def _real_extract(self, url):
 137         video_id = self._match_id(url)
 138
 139         self._set_cookie('pornhub.com', 'age_verified', '1')
 140
 141         def dl_webpage(platform):
 142             self._set_cookie('pornhub.com', 'platform', platform)
 143             return self._download_webpage(
 144                 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id,
 145                 video_id, 'Downloading %s webpage' % platform)
 146
 147         webpage = dl_webpage('pc')
 148
 149         error_msg = self._html_search_regex(
 150             r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
 151             webpage, 'error message', default=None, group='error')
 152         if error_msg:
 153             error_msg = re.sub(r'\s+', ' ', error_msg)
 154             raise ExtractorError(
 155                 'PornHub said: %s' % error_msg,
 156                 expected=True, video_id=video_id)
 157
 158         # video_title from flashvars contains whitespace instead of non-ASCII (see
 159         # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
 160         # on that anymore.
 161         title = self._html_search_meta(
 162             'twitter:title', webpage, default=None) or self._search_regex(
 163             (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
 164              r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
 165              r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
 166             webpage, 'title', group='title')
 167
 168         video_urls = []
 169         video_urls_set = set()
 170         subtitles = {}
 171
 172         flashvars = self._parse_json(
 173             self._search_regex(
 174                 r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
 175             video_id)
 176         if flashvars:
 177             subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
 178             if subtitle_url:
 179                 subtitles.setdefault('en', []).append({
 180                     'url': subtitle_url,
 181                     'ext': 'srt',
 182                 })
 183             thumbnail = flashvars.get('image_url')
 184             duration = int_or_none(flashvars.get('video_duration'))
 185             media_definitions = flashvars.get('mediaDefinitions')
 186             if isinstance(media_definitions, list):
 187                 for definition in media_definitions:
 188                     if not isinstance(definition, dict):
 189                         continue
 190                     video_url = definition.get('videoUrl')
 191                     if not video_url or not isinstance(video_url, compat_str):
 192                         continue
 193                     if video_url in video_urls_set:
 194                         continue
 195                     video_urls_set.add(video_url)
 196                     video_urls.append(
 197                         (video_url, int_or_none(definition.get('quality'))))
 198         else:
 199             thumbnail, duration = [None] * 2
 200
 201         if not video_urls:
 202             tv_webpage = dl_webpage('tv')
 203
 204             assignments = self._search_regex(
 205                 r'(var.+?mediastring.+?)</script>', tv_webpage,
 206                 'encoded url').split(';')
 207
 208             js_vars = {}
 209
 210             def parse_js_value(inp):
 211                 inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
 212                 if '+' in inp:
 213                     inps = inp.split('+')
 214                     return functools.reduce(
 215                         operator.concat, map(parse_js_value, inps))
 216                 inp = inp.strip()
 217                 if inp in js_vars:
 218                     return js_vars[inp]
 219                 return remove_quotes(inp)
 220
 221             for assn in assignments:
 222                 assn = assn.strip()
 223                 if not assn:
 224                     continue
 225                 assn = re.sub(r'var\s+', '', assn)
 226                 vname, value = assn.split('=', 1)
 227                 js_vars[vname] = parse_js_value(value)
 228
 229             video_url = js_vars['mediastring']
 230             if video_url not in video_urls_set:
 231                 video_urls.append((video_url, None))
 232                 video_urls_set.add(video_url)
 233
 234         for mobj in re.finditer(
 235                 r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
 236                 webpage):
 237             video_url = mobj.group('url')
 238             if video_url not in video_urls_set:
 239                 video_urls.append((video_url, None))
 240                 video_urls_set.add(video_url)
 241
 242         upload_date = None
 243         formats = []
 244         for video_url, height in video_urls:
 245             if not upload_date:
 246                 upload_date = self._search_regex(
 247                     r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
 248                 if upload_date:
 249                     upload_date = upload_date.replace('/', '')
 250             tbr = None
 251             mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
 252             if mobj:
 253                 if not height:
 254                     height = int(mobj.group('height'))
 255                 tbr = int(mobj.group('tbr'))
 256             formats.append({
 257                 'url': video_url,
 258                 'format_id': '%dp' % height if height else None,
 259                 'height': height,
 260                 'tbr': tbr,
 261             })
 262         self._sort_formats(formats)
 263
 264         video_uploader = self._html_search_regex(
 265             r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
 266             webpage, 'uploader', fatal=False)
 267
 268         view_count = self._extract_count(
 269             r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
 270         like_count = self._extract_count(
 271             r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
 272         dislike_count = self._extract_count(
 273             r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
 274         comment_count = self._extract_count(
 275             r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 276
 277         page_params = self._parse_json(self._search_regex(
 278             r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
 279             webpage, 'page parameters', group='data', default='{}'),
 280             video_id, transform_source=js_to_json, fatal=False)
 281         tags = categories = None
 282         if page_params:
 283             tags = page_params.get('tags', '').split(',')
 284             categories = page_params.get('categories', '').split(',')
 285
 286         return {
 287             'id': video_id,
 288             'uploader': video_uploader,
 289             'upload_date': upload_date,
 290             'title': title,
 291             'thumbnail': thumbnail,
 292             'duration': duration,
 293             'view_count': view_count,
 294             'like_count': like_count,
 295             'dislike_count': dislike_count,
 296             'comment_count': comment_count,
 297             'formats': formats,
 298             'age_limit': 18,
 299             'tags': tags,
 300             'categories': categories,
 301             'subtitles': subtitles,
 302         }
 303
 304
 305 class PornHubPlaylistBaseIE(InfoExtractor):
 306     def _extract_entries(self, webpage):
 307         # Only process container div with main playlist content skipping
 308         # drop-down menu that uses similar pattern for videos (see
 309         # https://github.com/rg3/youtube-dl/issues/11594).
 310         container = self._search_regex(
 311             r'(?s)(<div[^>]+class=["\']container.+)', webpage,
 312             'container', default=webpage)
 313
 314         return [
 315             self.url_result(
 316                 'http://www.pornhub.com/%s' % video_url,
 317                 PornHubIE.ie_key(), video_title=title)
 318             for video_url, title in orderedSet(re.findall(
 319                 r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
 320                 container))
 321         ]
 322
 323     def _real_extract(self, url):
 324         playlist_id = self._match_id(url)
 325
 326         webpage = self._download_webpage(url, playlist_id)
 327
 328         entries = self._extract_entries(webpage)
 329
 330         playlist = self._parse_json(
 331             self._search_regex(
 332                 r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage,
 333                 'playlist', default='{}'),
 334             playlist_id, fatal=False)
 335         title = playlist.get('title') or self._search_regex(
 336             r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)
 337
 338         return self.playlist_result(
 339             entries, playlist_id, title, playlist.get('description'))
 340
 341
class PornHubPlaylistIE(PornHubPlaylistBaseIE):
    # Matches numeric playlist pages on pornhub.com and any of its
    # subdomains; all extraction logic is inherited from
    # PornHubPlaylistBaseIE.
    _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/playlist/4667351',
        'info_dict': {
            'id': '4667351',
            'title': 'Nataly Hot',
        },
        'playlist_mincount': 2,
    }, {
        # subdomain other than www
        'url': 'https://de.pornhub.com/playlist/4667351',
        'only_matching': True,
    }]
 355
 356
 357 class PornHubUserVideosIE(PornHubPlaylistBaseIE):
 358     _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos'
 359     _TESTS = [{
 360         'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
 361         'info_dict': {
 362             'id': 'zoe_ph',
 363         },
 364         'playlist_mincount': 171,
 365     }, {
 366         'url': 'http://www.pornhub.com/users/rushandlia/videos',
 367         'only_matching': True,
 368     }, {
 369         # default sorting as Top Rated Videos
 370         'url': 'https://www.pornhub.com/channels/povd/videos',
 371         'info_dict': {
 372             'id': 'povd',
 373         },
 374         'playlist_mincount': 293,
 375     }, {
 376         # Top Rated Videos
 377         'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
 378         'only_matching': True,
 379     }, {
 380         # Most Recent Videos
 381         'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
 382         'only_matching': True,
 383     }, {
 384         # Most Viewed Videos
 385         'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
 386         'only_matching': True,
 387     }, {
 388         'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
 389         'only_matching': True,
 390     }, {
 391         'url': 'https://www.pornhub.com/model/jayndrea/videos/upload',
 392         'only_matching': True,
 393     }, {
 394         'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
 395         'only_matching': True,
 396     }]
 397
 398     def _real_extract(self, url):
 399         user_id = self._match_id(url)
 400
 401         entries = []
 402         for page_num in itertools.count(1):
 403             try:
 404                 webpage = self._download_webpage(
 405                     url, user_id, 'Downloading page %d' % page_num,
 406                     query={'page': page_num})
 407             except ExtractorError as e:
 408                 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
 409                     break
 410                 raise
 411             page_entries = self._extract_entries(webpage)
 412             if not page_entries:
 413                 break
 414             entries.extend(page_entries)
 415
 416         return self.playlist_result(entries, user_id)