Handle titles and captions that are set to the empty string ('')

[youtube-dl.git] / youtube_dl / extractor / bbc.py
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py

index 72e20857bf607b325c3e14cd1ebd67e41cddc235..c910eb55afc16cca496f57553e12fac9a0c85c62 100644 (file)
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -255,26 +255,11 @@ class BBCCoUkIE(InfoExtractor):
          for connection in self._extract_connections(media):
              captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
              lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
-            ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
-            srt = ''
-
-            def _extract_text(p):
-                if p.text is not None:
-                    stripped_text = p.text.strip()
-                    if stripped_text:
-                        return stripped_text
-                return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
-            for pos, p in enumerate(ps):
-                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
              subtitles[lang] = [
                  {
                      'url': connection.get('href'),
                      'ext': 'ttml',
                  },
-                {
-                    'data': srt,
-                    'ext': 'srt',
-                },
              ]
          return subtitles
  
@@ -428,6 +413,8 @@ class BBCNewsIE(BBCCoUkIE):
              'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
              'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
              'duration': 47,
+            'upload_date': '20150324',
+            'uploader': 'BBC News',
          },
          'params': {
              'skip_download': True,
@@ -438,8 +425,11 @@ class BBCNewsIE(BBCCoUkIE):
          'info_dict': {
              'id': 'NA',
              'ext': 'mp4',
-            'title': 'YPG - Tel Abyad..n tamam. kontrol.m.zde',
+            'title': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde',
+            'description': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde',
              'duration': 47,
+            'upload_date': '20150615',
+            'uploader': 'BBC News',
          },
          'params': {
              'skip_download': True,
@@ -450,8 +440,11 @@ class BBCNewsIE(BBCCoUkIE):
          'info_dict': {
              'id': '39275083',
              'ext': 'mp4',
-            'title': 'Honduras militariza sus hospitales por nuevo esc.ndalo de corrupci.n',
+            'title': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n',
+            'description': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n',
              'duration': 87,
+            'upload_date': '20150619',
+            'uploader': 'BBC News',
          },
          'params': {
              'skip_download': True,
@@ -502,12 +495,16 @@ class BBCNewsIE(BBCCoUkIE):
  
          for jent in jsent:
              programme_id = jent.get('externalId')
-            xml_url = jent.get('hxref')
+            xml_url = jent.get('href')
  
-            title = jent.get('caption',list_title)
+            title = jent.get('caption','')
+            if title == '':
+               title = list_title
  
              duration = parse_duration(jent.get('duration'))
-            description = list_title + ' - ' + jent.get('caption','')
+            description = list_title
+            if jent.get('caption', '') != '':
+               description += ' - ' + jent.get('caption')
              thumbnail = None
              if jent.has_key('image'):
                 thumbnail=jent['image'].get('href')
@@ -539,8 +536,12 @@ class BBCNewsIE(BBCCoUkIE):
                 
              self._sort_formats(formats)
  
+            id = jent.get('id') if programme_id == None else programme_id
+            if id == None:
+               id = 'NA'
+
              ret.append( {
-                'id': jent.get('programme_id',jent.get('id')),
+                'id': id,
                  'uploader': 'BBC News',
                  'upload_date': pubdate,
                  'title': title,