mirror of https://github.com/blackjack4494/yt-dlc
[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags
This commit is contained in:
parent
55af45fcab
commit
520251c093
|
@ -1,6 +1,7 @@
|
||||||
version <unreleased>
|
version <unreleased>
|
||||||
|
|
||||||
Core
|
Core
|
||||||
|
* Support m3u8 manifests in HTML5 multimedia tags
|
||||||
* Fix js_to_json(): correct octal or hexadecimal number detection
|
* Fix js_to_json(): correct octal or hexadecimal number detection
|
||||||
|
|
||||||
Extractors
|
Extractors
|
||||||
|
|
|
@ -1695,7 +1695,7 @@ class InfoExtractor(object):
|
||||||
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
|
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
|
||||||
return formats
|
return formats
|
||||||
|
|
||||||
def _parse_html5_media_entries(self, base_url, webpage):
|
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None):
|
||||||
def absolute_url(video_url):
|
def absolute_url(video_url):
|
||||||
return compat_urlparse.urljoin(base_url, video_url)
|
return compat_urlparse.urljoin(base_url, video_url)
|
||||||
|
|
||||||
|
@ -1710,6 +1710,21 @@ class InfoExtractor(object):
|
||||||
return f
|
return f
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
def _media_formats(src, cur_media_type=None):
    """Build the list of formats for one URL taken from an HTML5 media tag.

    src -- raw value of a ``src`` attribute; it is resolved through
           absolute_url() before being used.
    cur_media_type -- name of the enclosing media tag ('video' or
           'audio').  Optional: it defaults to None so that a caller
           passing only the URL does not fail with TypeError.  When it
           is omitted the plain-URL format gets 'vcodec': None, so
           callers that know the tag type should pass it.

    Returns a tuple (is_plain_url, formats):
      * is_plain_url is False when the URL's extension is 'm3u8'; formats
        is then whatever self._extract_m3u8_formats() returns for it.
      * is_plain_url is True otherwise; formats is a one-element list
        holding a dict with 'url' and 'vcodec' keys.
    """
    full_url = absolute_url(src)

    if determine_ext(full_url) == 'm3u8':
        # A manifest URL is not a single media file: delegate to the
        # m3u8 extractor and report that the result is not a plain URL.
        m3u8_formats = self._extract_m3u8_formats(
            full_url, video_id, ext='mp4', entry_protocol='m3u8_native',
            m3u8_id=m3u8_id)
        return False, m3u8_formats

    # Plain media URL: 'vcodec' is 'none' only for <audio> tags and is
    # left as None for anything else (including an unspecified tag type).
    plain_format = {
        'url': full_url,
        'vcodec': 'none' if cur_media_type == 'audio' else None,
    }
    return True, [plain_format]
|
||||||
|
|
||||||
entries = []
|
entries = []
|
||||||
for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
|
for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
|
||||||
media_info = {
|
media_info = {
|
||||||
|
@ -1719,10 +1734,8 @@ class InfoExtractor(object):
|
||||||
media_attributes = extract_attributes(media_tag)
|
media_attributes = extract_attributes(media_tag)
|
||||||
src = media_attributes.get('src')
|
src = media_attributes.get('src')
|
||||||
if src:
|
if src:
|
||||||
media_info['formats'].append({
|
_, formats = _media_formats(src)
|
||||||
'url': absolute_url(src),
|
media_info['formats'].extend(formats)
|
||||||
'vcodec': 'none' if media_type == 'audio' else None,
|
|
||||||
})
|
|
||||||
media_info['thumbnail'] = media_attributes.get('poster')
|
media_info['thumbnail'] = media_attributes.get('poster')
|
||||||
if media_content:
|
if media_content:
|
||||||
for source_tag in re.findall(r'<source[^>]+>', media_content):
|
for source_tag in re.findall(r'<source[^>]+>', media_content):
|
||||||
|
@ -1730,12 +1743,13 @@ class InfoExtractor(object):
|
||||||
src = source_attributes.get('src')
|
src = source_attributes.get('src')
|
||||||
if not src:
|
if not src:
|
||||||
continue
|
continue
|
||||||
|
is_plain_url, formats = _media_formats(src, media_type)
|
||||||
|
if is_plain_url:
|
||||||
f = parse_content_type(source_attributes.get('type'))
|
f = parse_content_type(source_attributes.get('type'))
|
||||||
f.update({
|
f.update(formats[0])
|
||||||
'url': absolute_url(src),
|
|
||||||
'vcodec': 'none' if media_type == 'audio' else None,
|
|
||||||
})
|
|
||||||
media_info['formats'].append(f)
|
media_info['formats'].append(f)
|
||||||
|
else:
|
||||||
|
media_info['formats'].extend(formats)
|
||||||
for track_tag in re.findall(r'<track[^>]+>', media_content):
|
for track_tag in re.findall(r'<track[^>]+>', media_content):
|
||||||
track_attributes = extract_attributes(track_tag)
|
track_attributes = extract_attributes(track_tag)
|
||||||
kind = track_attributes.get('kind')
|
kind = track_attributes.get('kind')
|
||||||
|
|
Loading…
Reference in New Issue