[WDR] complete overhaul after relaunch of the site

The WDR relaunched their site on 2016-02-23, which not only changed the
URL scheme completely but also the layout of their pages.

Apparently the whole "mediathek" now runs on the wdr.de domain, so there
is no separate URL for funkhauseuropa anymore.
The pages and URLs no longer seem to handle video sizes explicitly. There
appears to be only one size for HTML5, but still several sizes for Flash.
The extractor adds all of them to the list of formats.

There is no metadata for the HTML5 stream, so the best Flash stream will
always be considered the "best" format. At least in my tests it actually
was the best one anyway.
This commit is contained in:
Boris Wachtmeister 2016-03-12 18:00:26 +01:00
parent 29a7e8f6f8
commit c0837a12c8
1 changed files with 101 additions and 150 deletions

View File

@ -1,7 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import unicode_literals from __future__ import unicode_literals
import itertools
import re import re
from .common import InfoExtractor from .common import InfoExtractor
@ -11,204 +10,156 @@ from ..compat import (
) )
from ..utils import ( from ..utils import (
unified_strdate, unified_strdate,
qualities, ExtractorError,
) )
class WDRIE(InfoExtractor): class WDRIE(InfoExtractor):
_PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?' _PAGE_REGEX = r'/mediathek/(?P<media_type>[^/]+)/(?P<type>[^/]+)/(?P<display_id>.+)\.html'
_VALID_URL = r'(?P<url>https?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P<id>.+?)(?P<player>%s)?\.html' % _PLAYER_REGEX _VALID_URL = r'(?P<page_url>https?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX
_JS_URL_REGEX = r'(https?://deviceids-medp.wdr.de/ondemand/\d+/\d+\.js)'
_TESTS = [ _TESTS = [
{ {
'url': 'http://www1.wdr.de/mediathek/video/sendungen/servicezeit/videoservicezeit560-videoplayer_size-L.html', 'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html',
'md5': 'e58c39c3e30077141d258bf588700a7b',
'info_dict': { 'info_dict': {
'id': 'mdb-362427', 'id': 'mdb-1058683',
'ext': 'flv', 'ext': 'flv',
'title': 'Servicezeit', 'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100',
'description': 'md5:c8f43e5e815eeb54d0b96df2fba906cb', 'title': 'Geheimnis Aachener Dom',
'upload_date': '20140310', 'alt_title': 'Doku am Freitag',
'is_live': False 'upload_date': '20160304',
}, 'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318',
'params': { 'is_live': False,
'skip_download': True, 'subtitles': {'de': [{
'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml'
}]},
}, },
'skip': 'Page Not Found', 'skip': 'Page Not Found',
}, },
{ {
'url': 'http://www1.wdr.de/themen/av/videomargaspiegelisttot101-videoplayer.html', 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html',
'md5': 'f4c1f96d01cf285240f53ea4309663d8',
'info_dict': { 'info_dict': {
'id': 'mdb-363194', 'id': 'mdb-1072000',
'ext': 'mp3',
'display_id': 'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100',
'title': 'Schriftstellerin Juli Zeh',
'alt_title': 'WDR 3 Gespräch am Samstag',
'upload_date': '20160312',
'description': 'md5:e127d320bc2b1f149be697ce044a3dd7',
'is_live': False,
'subtitles': {}
},
'skip': 'Page Not Found',
},
{
'url': 'http://www1.wdr.de/mediathek/video/live/index.html',
'info_dict': {
'id': 'mdb-103364',
'ext': 'flv', 'ext': 'flv',
'title': 'Marga Spiegel ist tot', 'display_id': 'index',
'description': 'md5:2309992a6716c347891c045be50992e4', 'title': r're:^WDR Fernsehen im Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'upload_date': '20140311', 'alt_title': 'WDR Fernsehen Live',
'is_live': False 'upload_date': None,
}, 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9',
'params': { 'is_live': True,
'skip_download': True, 'subtitles': {}
},
'skip': 'Page Not Found',
},
{
'url': 'http://www1.wdr.de/themen/kultur/audioerlebtegeschichtenmargaspiegel100-audioplayer.html',
'md5': '83e9e8fefad36f357278759870805898',
'info_dict': {
'id': 'mdb-194332',
'ext': 'mp3',
'title': 'Erlebte Geschichten: Marga Spiegel (29.11.2009)',
'description': 'md5:2309992a6716c347891c045be50992e4',
'upload_date': '20091129',
'is_live': False
},
},
{
'url': 'http://www.funkhauseuropa.de/av/audioflaviacoelhoamaramar100-audioplayer.html',
'md5': '99a1443ff29af19f6c52cf6f4dc1f4aa',
'info_dict': {
'id': 'mdb-478135',
'ext': 'mp3',
'title': 'Flavia Coelho: Amar é Amar',
'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a',
'upload_date': '20140717',
'is_live': False
},
'skip': 'Page Not Found',
},
{
'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html',
'playlist_mincount': 146,
'info_dict': {
'id': 'mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100',
} }
}, },
{ {
'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html', 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html',
'playlist_mincount': 10,
'info_dict': { 'info_dict': {
'id': 'mdb-103364', 'id': 'aktuelle-stunde/aktuelle-stunde-120',
'title': 're:^WDR Fernsehen Live [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9',
'ext': 'flv',
'upload_date': '20150101',
'is_live': True
},
'params': {
'skip_download': True,
}, },
} }
] ]
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
page_url = mobj.group('url') url_type = mobj.group('type')
page_id = mobj.group('id') page_url = mobj.group('page_url')
display_id = mobj.group('display_id')
webpage = self._download_webpage(url, display_id)
webpage = self._download_webpage(url, page_id) js_url = self._search_regex(self._JS_URL_REGEX, webpage, 'js_url', default=None)
if mobj.group('player') is None: if not js_url:
entries = [ entries = [
self.url_result(page_url + href, 'WDR') self.url_result(page_url + href[0], 'WDR')
for href in re.findall( for href in re.findall(
r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX, r'<a href="(%s)"' % self._PAGE_REGEX,
webpage) webpage)
] ]
if entries: # Playlist page if entries: # Playlist page
return self.playlist_result(entries, page_id) return self.playlist_result(entries, playlist_id=display_id)
# Overview page raise ExtractorError('No downloadable streams found', expected=True)
entries = []
for page_num in itertools.count(2):
hrefs = re.findall(
r'<li class="mediathekvideo"\s*>\s*<img[^>]*>\s*<a href="(/mediathek/video/[^"]+)"',
webpage)
entries.extend(
self.url_result(page_url + href, 'WDR')
for href in hrefs)
next_url_m = re.search(
r'<li class="nextToLast">\s*<a href="([^"]+)"', webpage)
if not next_url_m:
break
next_url = page_url + next_url_m.group(1)
webpage = self._download_webpage(
next_url, page_id,
note='Downloading playlist page %d' % page_num)
return self.playlist_result(entries, page_id)
flashvars = compat_parse_qs(self._html_search_regex( js_data = self._download_webpage(js_url, 'metadata')
r'<param name="flashvars" value="([^"]+)"', webpage, 'flashvars')) json_data = self._search_regex(r'\(({.*})\)', js_data, 'json')
metadata = self._parse_json(json_data, display_id)
page_id = flashvars['trackerClipId'][0] metadata_tracker_data = metadata["trackerData"]
video_url = flashvars['dslSrc'][0] metadata_media_resource = metadata["mediaResource"]
title = flashvars['trackerClipTitle'][0]
thumbnail = flashvars['startPicture'][0] if 'startPicture' in flashvars else None formats = []
is_live = flashvars.get('isLive', ['0'])[0] == '1'
# check if the metadata contains a direct URL to a file
metadata_media_alt = metadata_media_resource.get("alt")
if metadata_media_alt:
for tag_name in ["videoURL", 'audioURL']:
if tag_name in metadata_media_alt:
formats.append({
'url': metadata_media_alt[tag_name]
})
# check if there are flash-streams for this video
if "dflt" in metadata_media_resource and "videoURL" in metadata_media_resource["dflt"]:
video_url = metadata_media_resource["dflt"]["videoURL"]
if video_url.endswith('.f4m'):
full_video_url = video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18'
formats.extend(self._extract_f4m_formats(full_video_url, display_id, f4m_id='hds', fatal=False))
elif video_url.endswith('.smil'):
formats.extend(self._extract_smil_formats(video_url, 'stream', fatal=False))
subtitles = {}
caption_url = metadata_media_resource.get("captionURL")
if caption_url:
subtitles['de'] = [{
'url': caption_url
}]
title = metadata_tracker_data.get("trackerClipTitle")
is_live = url_type == 'live'
if is_live: if is_live:
title = self._live_title(title) title = self._live_title(title)
upload_date = None
if 'trackerClipAirTime' in flashvars: elif 'trackerClipAirTime' in metadata_tracker_data:
upload_date = flashvars['trackerClipAirTime'][0] upload_date = metadata_tracker_data['trackerClipAirTime']
else: else:
upload_date = self._html_search_meta( upload_date = self._html_search_meta('DC.Date', webpage, 'upload date')
'DC.Date', webpage, 'upload date')
if upload_date: if upload_date:
upload_date = unified_strdate(upload_date) upload_date = unified_strdate(upload_date)
formats = []
preference = qualities(['S', 'M', 'L', 'XL'])
if video_url.endswith('.f4m'):
formats.extend(self._extract_f4m_formats(
video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', page_id,
f4m_id='hds', fatal=False))
elif video_url.endswith('.smil'):
formats.extend(self._extract_smil_formats(
video_url, page_id, False, {
'hdcore': '3.3.0',
'plugin': 'aasp-3.3.0.99.43',
}))
else:
formats.append({
'url': video_url,
'http_headers': {
'User-Agent': 'mobile',
},
})
m3u8_url = self._search_regex(
r'rel="adaptiv"[^>]+href="([^"]+)"',
webpage, 'm3u8 url', default=None)
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, page_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
direct_urls = re.findall(
r'rel="web(S|M|L|XL)"[^>]+href="([^"]+)"', webpage)
if direct_urls:
for quality, video_url in direct_urls:
formats.append({
'url': video_url,
'preference': preference(quality),
'http_headers': {
'User-Agent': 'mobile',
},
})
self._sort_formats(formats) self._sort_formats(formats)
description = self._html_search_meta('Description', webpage, 'description')
return { return {
'id': page_id, 'id': metadata_tracker_data.get("trackerClipId", display_id),
'formats': formats, 'display_id': display_id,
'title': title, 'title': title,
'description': description, 'alt_title': metadata_tracker_data.get("trackerClipSubcategory"),
'thumbnail': thumbnail, 'formats': formats,
'upload_date': upload_date, 'upload_date': upload_date,
'is_live': is_live 'description': self._html_search_meta("Description", webpage),
'is_live': is_live,
'subtitles': subtitles,
} }