[chirbit] Simplify and extract profile from RSS (#5032)

This commit is contained in:
Sergey M․ 2015-02-23 21:15:16 +06:00
parent 543ec2136b
commit a65d4e7f14
2 changed files with 53 additions and 65 deletions

View File

@ -63,7 +63,10 @@ from .ccc import CCCIE
from .ceskatelevize import CeskaTelevizeIE from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE from .channel9 import Channel9IE
from .chilloutzone import ChilloutzoneIE from .chilloutzone import ChilloutzoneIE
from .chirbit import ChirbitIE, ChirbitProfileIE from .chirbit import (
ChirbitIE,
ChirbitProfileIE,
)
from .cinchcast import CinchcastIE from .cinchcast import CinchcastIE
from .clipfish import ClipfishIE from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE from .cliphunter import CliphunterIE

View File

@ -1,97 +1,82 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import clean_html from ..utils import (
parse_duration,
int_or_none,
)
class ChirbitIE(InfoExtractor): class ChirbitIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?chirb\.it/(?P<id>[^/]+)' _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)'
_TEST = { _TESTS = [{
'url': 'http://chirb.it/PrIPv5', 'url': 'http://chirb.it/PrIPv5',
'md5': '9847b0dad6ac3e074568bf2cfb197de8', 'md5': '9847b0dad6ac3e074568bf2cfb197de8',
'info_dict': { 'info_dict': {
'id': 'PrIPv5', 'id': 'PrIPv5',
'display_id': 'kukushtv_1423231243',
'ext': 'mp3', 'ext': 'mp3',
'title': 'Фасадстрой', 'title': 'Фасадстрой',
'url': 'http://audio.chirbit.com/kukushtv_1423231243.mp3' 'duration': 52,
'view_count': int,
'comment_count': int,
} }
} }, {
'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5',
'only_matching': True,
}]
def _real_extract(self, url): def _real_extract(self, url):
audio_linkid = self._match_id(url) audio_id = self._match_id(url)
webpage = self._download_webpage(url, audio_linkid)
audio_title = self._html_search_regex(r'<h2\s+itemprop="name">(.*?)</h2>', webpage, 'title') webpage = self._download_webpage(
audio_id = self._html_search_regex(r'\("setFile",\s+"http://audio.chirbit.com/(.*?).mp3"\)', webpage, 'audio ID') 'http://chirb.it/%s' % audio_id, audio_id)
audio_url = 'http://audio.chirbit.com/' + audio_id + '.mp3';
audio_url = self._search_regex(
r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url')
title = self._search_regex(
r'itemprop="name">([^<]+)', webpage, 'title')
duration = parse_duration(self._html_search_meta(
'duration', webpage, 'duration', fatal=False))
view_count = int_or_none(self._search_regex(
r'itemprop="playCount"\s*>(\d+)', webpage,
'listen count', fatal=False))
comment_count = int_or_none(self._search_regex(
r'>(\d+) Comments?:', webpage,
'comment count', fatal=False))
return { return {
'id': audio_linkid, 'id': audio_id,
'display_id': audio_id, 'url': audio_url,
'title': audio_title, 'title': title,
'url': audio_url 'duration': duration,
'view_count': view_count,
'comment_count': comment_count,
} }
class ChirbitProfileIE(InfoExtractor): class ChirbitProfileIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P<id>[^/]+)/?$' _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P<id>[^/]+)'
_TEST = { _TEST = {
'url': 'http://chirbit.com/ScarletBeauty', 'url': 'http://chirbit.com/ScarletBeauty',
'playlist_count': 3,
'info_dict': { 'info_dict': {
'_type': 'playlist', 'id': 'ScarletBeauty',
'title': 'ScarletBeauty', 'title': 'Chirbits by ScarletBeauty',
'id': 'ScarletBeauty' },
} 'playlist_mincount': 3,
} }
def _real_extract(self, url): def _real_extract(self, url):
profile_id = self._match_id(url) profile_id = self._match_id(url)
# Chirbit has a pretty weird "Last Page" navigation behavior. rss = self._download_xml(
# We grab the profile's oldest entry to determine when to 'http://chirbit.com/rss/%s' % profile_id, profile_id)
# stop fetching entries.
oldestpage = self._download_webpage(url + '/24599', profile_id)
oldest_page_entries = re.findall(
r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''',
oldestpage);
oldestentry = clean_html(oldest_page_entries[-1]);
ids = [] entries = [
titles = [] self.url_result(audio_url.text, 'Chirbit')
n = 0 for audio_url in rss.findall('./channel/item/link')]
while True:
page = self._download_webpage(url + '/' + str(n), profile_id)
page_ids = re.findall(
r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''',
page);
page_titles = re.findall(
r'''<div\s+class="chirbit_title"\s*>(.*?)</div>''',
page);
ids += page_ids
titles += page_titles
if oldestentry in page_ids:
break
n += 1
entries = [] title = rss.find('./channel/title').text
i = 0
for id in ids:
entries.append({
'id': id,
'title': titles[i],
'url': 'http://audio.chirbit.com/' + id + '.mp3'
});
i += 1
info_dict = { return self.playlist_result(entries, profile_id, title)
'_type': 'playlist',
'id': profile_id,
'title': profile_id,
'entries': entries
}
return info_dict;