[Senate] Add new extractor (#5302)

2015-04-21 02:29:56 +08:00 · 2015-04-21 02:29:56 +08:00 · c6391cd587
parent 006ce15a0c
commit c6391cd587
3 changed files with 132 additions and 0 deletions
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@ -389,6 +389,8 @@ class F4mFD(FileDownloader):
            url = base_url + name
            if akamai_pv:
                url += '?' + akamai_pv.strip(';')
            if info_dict.get('extra_param_to_segment_url'):
                url += info_dict.get('extra_param_to_segment_url')
            frag_filename = '%s-%s' % (tmpfilename, name)
            try:
                success = http_dl.download(frag_filename, {'url': url})
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@ -447,6 +447,7 @@ from .scivee import SciVeeIE
 from .screencast import ScreencastIE
 from .screencastomatic import ScreencastOMaticIE
 from .screenwavemedia import CinemassacreIE, ScreenwaveMediaIE, TeamFourIE
 from .senateisvp import SenateISVPIE
 from .servingsys import ServingSysIE
 from .sexu import SexuIE
 from .sexykarma import SexyKarmaIE
--- a/youtube_dl/extractor/senateisvp.py
+++ b/youtube_dl/extractor/senateisvp.py
@ -0,0 +1,129 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..utils import ExtractorError
 from ..compat import (
    compat_parse_qs,
    compat_urlparse,
 )
 class SenateISVPIE(InfoExtractor):
    """Extractor for the Integrated Senate Video Player (senate.gov/isvp).

    The player URL carries everything needed in its query string:
    ``filename`` (the video id, optionally with an ``.mp4`` suffix),
    ``type`` (``arch`` selects the direct-download branch, any other value
    the f4m/m3u8 streaming branch) and ``comm`` (the committee key looked
    up in ``_COMM_MAP``).
    """
    # Each entry is [committee key, stream number, base URL of the host
    # serving that committee's videos]. The 'arch' pseudo-committee is used
    # for every URL whose ``type`` is 'arch', regardless of ``comm``.
    _COMM_MAP = [
        ["ag", "76440", "http://ag-f.akamaihd.net"],
        ["aging", "76442", "http://aging-f.akamaihd.net"],
        ["approps", "76441", "http://approps-f.akamaihd.net"],
        ["armed", "76445", "http://armed-f.akamaihd.net"],
        ["banking", "76446", "http://banking-f.akamaihd.net"],
        ["budget", "76447", "http://budget-f.akamaihd.net"],
        ["cecc", "76486", "http://srs-f.akamaihd.net"],
        ["commerce", "80177", "http://commerce1-f.akamaihd.net"],
        ["csce", "75229", "http://srs-f.akamaihd.net"],
        ["dpc", "76590", "http://dpc-f.akamaihd.net"],
        ["energy", "76448", "http://energy-f.akamaihd.net"],
        ["epw", "76478", "http://epw-f.akamaihd.net"],
        ["ethics", "76449", "http://ethics-f.akamaihd.net"],
        ["finance", "76450", "http://finance-f.akamaihd.net"],
        ["foreign", "76451", "http://foreign-f.akamaihd.net"],
        ["govtaff", "76453", "http://govtaff-f.akamaihd.net"],
        ["help", "76452", "http://help-f.akamaihd.net"],
        ["indian", "76455", "http://indian-f.akamaihd.net"],
        ["intel", "76456", "http://intel-f.akamaihd.net"],
        ["intlnarc", "76457", "http://intlnarc-f.akamaihd.net"],
        ["jccic", "85180", "http://jccic-f.akamaihd.net"],
        ["jec", "76458", "http://jec-f.akamaihd.net"],
        ["judiciary", "76459", "http://judiciary-f.akamaihd.net"],
        ["rpc", "76591", "http://rpc-f.akamaihd.net"],
        ["rules", "76460", "http://rules-f.akamaihd.net"],
        ["saa", "76489", "http://srs-f.akamaihd.net"],
        ["smbiz", "76461", "http://smbiz-f.akamaihd.net"],
        ["srs", "75229", "http://srs-f.akamaihd.net"],
        ["uscc", "76487", "http://srs-f.akamaihd.net"],
        ["vetaff", "76462", "http://vetaff-f.akamaihd.net"],
        ["arch", "", "http://ussenate-f.akamaihd.net/"]
    ]
    _IE_NAME = 'senate.gov'
    _VALID_URL = r'http://www\.senate\.gov/isvp/\?(?P<qs>.+)'
    _TESTS = [{
        'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
        'md5': '7314c4b96dad66dd8e63dc3518ceaa6f',
        'info_dict': {
            'id': 'judiciary031715',
            'ext': 'flv',
            'title': 'Integrated Senate Video Player',
        }
    }, {
        'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false',
        'md5': '2917c827513700aa9b70eaebf25116da',
        'info_dict': {
            'id': 'commerce011514',
            'ext': 'flv',
            'title': 'Integrated Senate Video Player'
        }
    }, {
        'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi',
        # checksum differs each time
        'info_dict': {
            'id': 'intel090613',
            'ext': 'mp4',
            'title': 'Integrated Senate Video Player'
        }
    }]
    def _get_info_for_comm(self, committee):
        """Return ``[stream_num, domain]`` for *committee*, or None if unknown."""
        for entry in self._COMM_MAP:
            if entry[0] == committee:
                return entry[1:]
        return None
    def _real_extract(self, url):
        """Build the info dict for a senate.gov ISVP player URL.

        Raises ExtractorError (expected) when the query string lacks
        ``filename``, ``type`` or ``comm``, or when the committee is not
        listed in ``_COMM_MAP``.
        """
        qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs'))
        if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
            raise ExtractorError('Invalid URL', expected=True)
        # The dot is escaped so that only a literal ".mp4" suffix is removed.
        video_id = re.sub(r'\.mp4$', '', qs['filename'][0])
        video_type = qs['type'][0]
        committee = video_type if video_type == 'arch' else qs['comm'][0]
        # Resolve the committee before any network access so an unknown key
        # fails with a clear message instead of a TypeError on unpacking None.
        comm_info = self._get_info_for_comm(committee)
        if comm_info is None:
            raise ExtractorError(
                'Unknown committee: %s' % committee, expected=True)
        stream_num, domain = comm_info
        webpage = self._download_webpage(url, video_id)
        # The third argument names the field being searched for.
        title = self._html_search_regex(
            r'<title>([^<]+)</title>', webpage, 'title')
        formats = []
        if video_type == 'arch':
            filename = video_id if '.' in video_id else video_id + '.mp4'
            formats = [{
                # All parameters in the query string are necessary to prevent a 403 error
                'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=',
            }]
        else:
            hdcore_sign = '?hdcore=3.1.0'
            url_params = (domain, video_id, stream_num)
            f4m_url = ('%s/z/%s_1@%s/manifest.f4m' % url_params) + hdcore_sign
            m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params
            for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'):
                # URLs without the extra param induce a 404 error
                entry['extra_param_to_segment_url'] = hdcore_sign
                formats.append(entry)
            for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'):
                # Append the "-p"/"-b" marker found right before ".m3u8" in
                # the playlist URL to the format id.
                mobj = re.search(r'(?P<tag>(?:-p|-b))\.m3u8', entry['url'])
                if mobj:
                    entry['format_id'] += mobj.group('tag')
                formats.append(entry)
            self._sort_formats(formats)
        # Always hand back the list under 'formats'; the former fallback
        # indexed formats[0] precisely when the list was empty.
        return {
            'id': video_id,
            'title': title,
            'formats': formats,
        }