yt-dlc/youtube_dl/extractor/orf.py

import re
import xml.etree.ElementTree
import json

from .common import InfoExtractor
from ..utils import (
    compat_urlparse,
    ExtractorError,
    find_xpath_attr,
)

class ORFIE(InfoExtractor):
    """Extractor for programme-episode and topic pages of tvthek.orf.at.

    The page embeds two parallel descriptions of its playlist: a
    URL-encoded flash XML configuration (``ORF.flashXML``) that carries the
    stream URLs, and a JSON ``playlist`` array that carries the id and
    title of every entry.  Both are read and combined item by item.
    """

    # The dots of the host name are escaped so that only the literal host
    # ``tvthek.orf.at`` is accepted (an unescaped dot matches any character).
    _VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'

    _TEST = {
        u'url': u'http://tvthek.orf.at/programs/1171769-Wetter-ZIB/episodes/6557323-Wetter',
        u'file': u'6566957.flv',
        u'info_dict': {
            u'title': u'Wetter',
            u'description': u'Christa Kummer, Marcus Wadsak und Kollegen  präsentieren abwechselnd ihre täglichen Wetterprognosen für Österreich.\r \r Mehr Wetter unter wetter.ORF.at',
        },
        u'params': {
            # It uses rtmp
            u'skip_download': True,
        }
    }

    def _real_extract(self, url):
        """Return a list with one info dict per playlist entry of the page.

        Every dict has the keys ``_type``, ``id``, ``title``, ``url``,
        ``ext`` and ``description``.

        Raises ExtractorError when no stream URL can be found for an entry
        in any of the known qualities.
        """
        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')
        webpage = self._download_webpage(url, playlist_id)

        # The flash configuration is stored URL-encoded in a JS string;
        # parse_qs is used here purely to percent-decode it.
        flash_xml = self._search_regex(
            r"ORF\.flashXML = '(.+?)'", webpage, u'flash xml')
        flash_xml = compat_urlparse.parse_qs('xml=' + flash_xml)['xml'][0]
        flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8'))

        # The JSON playlist sits inside a single-quoted JS string with its
        # double quotes backslash-escaped; undo that before parsing.
        playlist_json = self._search_regex(
            r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"', '"')
        playlist = json.loads(playlist_json)

        videos = []
        ns = '{http://tempuri.org/XMLSchema.xsd}'
        xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns}
        video_url_tag = '%sVideoUrl' % ns
        # Fallback description used when an entry has none of its own
        webpage_description = self._og_search_description(webpage)

        # NOTE: zip() stops at the shorter of the two sequences, so entries
        # present in only one of the XML config / JSON playlist are skipped.
        items = flash_config.findall(xpath)
        for (i, (item, info)) in enumerate(zip(items, playlist), 1):
            # Get best quality url (qualities are tried in this order)
            rtmp_url = None
            for q in ['Q6A', 'Q4A', 'Q1A']:
                video_url = find_xpath_attr(item, video_url_tag, 'quality', q)
                # An element without text is useless as a stream URL, so
                # keep looking at the remaining qualities in that case.
                if video_url is not None and video_url.text:
                    rtmp_url = video_url.text
                    break
            if rtmp_url is None:
                raise ExtractorError(u'Couldn\'t get video url: %s' % info['id'])

            # Entries are numbered from 1 in the page markup
            description = self._html_search_regex(
                r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage,
                u'description', default=webpage_description, flags=re.DOTALL)
            videos.append({
                '_type': 'video',
                'id': info['id'],
                'title': info['title'],
                'url': rtmp_url,
                'ext': 'flv',
                'description': description,
            })

        return videos
Add an extractor for orf.at (closes #1346). Make find_xpath_attr also accept numbers in the value. (2013-08-29 19:16:07 +02:00)			`import re`
			`import xml.etree.ElementTree`
			`import json`

			`from .common import InfoExtractor`
			`from ..utils import (`
			`compat_urlparse,`
			`ExtractorError,`
			`find_xpath_attr,`
			`)`

			`class ORFIE(InfoExtractor):`
			`_VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'`

			`_TEST = {`
			`u'url': u'http://tvthek.orf.at/programs/1171769-Wetter-ZIB/episodes/6557323-Wetter',`
			`u'file': u'6566957.flv',`
			`u'info_dict': {`
			`u'title': u'Wetter',`
			`u'description': u'Christa Kummer, Marcus Wadsak und Kollegen präsentieren abwechselnd ihre täglichen Wetterprognosen für Österreich.\r \r Mehr Wetter unter wetter.ORF.at',`
			`},`
			`u'params': {`
			`# It uses rtmp`
			`u'skip_download': True,`
			`}`
			`}`

			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`playlist_id = mobj.group('id')`
			`webpage = self._download_webpage(url, playlist_id)`

			`flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml')`
			`flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0]`
			`flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8'))`
			`playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"')`
			`playlist = json.loads(playlist_json)`

			`videos = []`
			`ns = '{http://tempuri.org/XMLSchema.xsd}'`
			`xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns}`
			`webpage_description = self._og_search_description(webpage)`
			`for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1):`
			`# Get best quality url`
			`rtmp_url = None`
			`for q in ['Q6A', 'Q4A', 'Q1A']:`
			`video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q)`
			`if video_url is not None:`
			`rtmp_url = video_url.text`
			`break`
			`if rtmp_url is None:`
			`raise ExtractorError(u'Couldn\'t get video url: %s' % info['id'])`
			`description = self._html_search_regex(`
			`r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage,`
			`u'description', default=webpage_description, flags=re.DOTALL)`
			`videos.append({`
			`'_type': 'video',`
			`'id': info['id'],`
			`'title': info['title'],`
			`'url': rtmp_url,`
			`'ext': 'flv',`
			`'description': description,`
			`})`

			`return videos`