[vk:wallpost] Fix audio extraction

This commit is contained in:
Sergey M․ 2016-08-18 06:14:05 +07:00
parent 08a42f9c74
commit 51815886a9
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D
1 changed file with 38 additions and 28 deletions

View File

@@ -1,6 +1,7 @@
# encoding: utf-8 # encoding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import collections
import re import re
import json import json
import sys import sys
@@ -16,7 +17,6 @@ from ..utils import (
get_element_by_class, get_element_by_class,
int_or_none, int_or_none,
orderedSet, orderedSet,
parse_duration,
remove_start, remove_start,
str_to_int, str_to_int,
unescapeHTML, unescapeHTML,
@@ -447,6 +447,9 @@ class VKWallPostIE(VKBaseIE):
'skip_download': True, 'skip_download': True,
}, },
}], }],
'params': {
'usenetrc': True,
},
'skip': 'Requires vk account credentials', 'skip': 'Requires vk account credentials',
}, { }, {
# single YouTube embed, no leading - # single YouTube embed, no leading -
@@ -456,6 +459,9 @@ class VKWallPostIE(VKBaseIE):
'title': 'Sergey Gorbunov - Wall post 85155021_6319', 'title': 'Sergey Gorbunov - Wall post 85155021_6319',
}, },
'playlist_count': 1, 'playlist_count': 1,
'params': {
'usenetrc': True,
},
'skip': 'Requires vk account credentials', 'skip': 'Requires vk account credentials',
}, { }, {
# wall page URL # wall page URL
@@ -483,37 +489,41 @@ class VKWallPostIE(VKBaseIE):
raise ExtractorError('VK said: %s' % error, expected=True) raise ExtractorError('VK said: %s' % error, expected=True)
description = clean_html(get_element_by_class('wall_post_text', webpage)) description = clean_html(get_element_by_class('wall_post_text', webpage))
uploader = clean_html(get_element_by_class( uploader = clean_html(get_element_by_class('author', webpage))
'fw_post_author', webpage)) or self._og_search_description(webpage)
thumbnail = self._og_search_thumbnail(webpage) thumbnail = self._og_search_thumbnail(webpage)
entries = [] entries = []
for audio in re.finditer(r'''(?sx) audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage)
<input[^>]+ if audio_ids:
id=(?P<q1>["\'])audio_info(?P<id>\d+_\d+).*?(?P=q1)[^>]+ al_audio = self._download_webpage(
value=(?P<q2>["\'])(?P<url>http.+?)(?P=q2) 'https://vk.com/al_audio.php', post_id,
.+? note='Downloading audio info', fatal=False,
</table>''', webpage): data=urlencode_postdata({
audio_html = audio.group(0) 'act': 'reload_audio',
audio_id = audio.group('id') 'al': '1',
duration = parse_duration(get_element_by_class('duration', audio_html)) 'ids': ','.join(audio_ids)
track = self._html_search_regex( }))
r'<span[^>]+id=["\']title%s[^>]*>([^<]+)' % audio_id, if al_audio:
audio_html, 'title', default=None) Audio = collections.namedtuple(
artist = self._html_search_regex( 'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration'])
r'>([^<]+)</a></b>\s*&ndash', audio_html, audios = self._parse_json(
'artist', default=None) self._search_regex(
entries.append({ r'<!json>(.+?)<!>', al_audio, 'audios', default='[]'),
'id': audio_id, post_id, fatal=False, transform_source=unescapeHTML)
'url': audio.group('url'), if isinstance(audios, list):
'title': '%s - %s' % (artist, track) if artist and track else audio_id, for audio in audios:
'thumbnail': thumbnail, a = Audio._make(audio[:6])
'duration': duration, entries.append({
'uploader': uploader, 'id': '%s_%s' % (a.user_id, a.id),
'artist': artist, 'url': a.url,
'track': track, 'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id,
}) 'thumbnail': thumbnail,
'duration': a.duration,
'uploader': uploader,
'artist': a.artist,
'track': a.track,
})
for video in re.finditer( for video in re.finditer(
r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage): r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage):