Changed spaces to tabs (per yt-dl standards) and fixed bugs, but it still won't download. Need to figure out how the whole process works in order to integrate it correctly.

This commit is contained in:
Kevin Ngo 2011-11-10 01:04:33 -08:00
parent 073d7a5985
commit b20d4f8626
1 changed files with 47 additions and 40 deletions

View File

@ -3481,20 +3481,20 @@ class XVideosIE(InfoExtractor):
self._downloader.trouble(u'\nERROR: unable to download ' + video_id) self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class SoundcloudIE(InformationExtractor): class SoundcloudIE(InfoExtractor):
"""Information extractor for soundcloud.com """Information extractor for soundcloud.com
To access the media, the uid of the song and a stream token To access the media, the uid of the song and a stream token
must be extracted from the page source and the script must make must be extracted from the page source and the script must make
a request to media.soundcloud.com/crossdomain.xml. Then a request to media.soundcloud.com/crossdomain.xml. Then
the media can be grabbed by requesting from an url composed the media can be grabbed by requesting from an url composed
of the stream token and uid of the stream token and uid
""" """
_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)' _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
IE_NAME = u'soundcloud' IE_NAME = u'soundcloud'
def __init__(self, downloader=None): def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader) InfoExtractor.__init__(self, downloader)
def report_webpage(self, video_id): def report_webpage(self, video_id):
"""Report information extraction.""" """Report information extraction."""
@ -3504,8 +3504,8 @@ class SoundcloudIE(InformationExtractor):
"""Report information extraction.""" """Report information extraction."""
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
def _real_initialize(self): def _real_initialize(self):
return return
def _real_extract(self, url): def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser() htmlParser = HTMLParser.HTMLParser()
@ -3515,10 +3515,10 @@ class SoundcloudIE(InformationExtractor):
self._downloader.trouble(u'ERROR: invalid URL: %s' % url) self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
return return
# extract uploader (which is in the url) # extract uploader (which is in the url)
uploader = mobj.group(3).decode('utf-8') uploader = mobj.group(1).decode('utf-8')
# extract simple title (uploader + slug of song title) # extract simple title (uploader + slug of song title)
slug_title = mobj.group(4).decode('utf-8') slug_title = mobj.group(2).decode('utf-8')
simple_title = uploader + '-' + slug_title simple_title = uploader + '-' + slug_title
self.report_webpage('%s/%s' % (uploader, slug_title)) self.report_webpage('%s/%s' % (uploader, slug_title))
@ -3532,32 +3532,36 @@ class SoundcloudIE(InformationExtractor):
self.report_extraction('%s/%s' % (uploader, slug_title)) self.report_extraction('%s/%s' % (uploader, slug_title))
# extract uid and access token # extract uid and access token
mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', page) mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', page)
if mobj: if mobj:
video_id = match.group(1) video_id = match.group(1)
stream_token = match.group(2) stream_token = match.group(2)
# construct media url (with uid/token) to request song # construct media url (with uid/token) to request song
mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s" mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
mediaURL = mediaURL % (video_id, stream_token) mediaURL = mediaURL % (video_id, stream_token)
# description # description
description = u'No description available' description = u'No description available'
mobj = re.search('track-description-value"><p>(.*?)</p>', page) mobj = re.search('track-description-value"><p>(.*?)</p>', page)
if mobj: if mobj:
description = mobj.group(1) description = mobj.group(1)
# upload date # upload date
mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", page) mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", page)
if mobj: if mobj:
try: try:
upload_date = datetime.datetime.strptime(match.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d') upload_date = datetime.datetime.strptime(match.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
except: except:
pass pass
try: # for soundcloud, a request must be made to a cross domain to establish
self._download.process_info({ # needed cookies
request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
try:
self._downloader.process_info({
'id': video_id, 'id': video_id,
'url': video_url, 'url': video_url,
'uploader': uploader, 'uploader': uploader,
@ -3567,8 +3571,10 @@ class SoundcloudIE(InformationExtractor):
'ext': u'mp3', 'ext': u'mp3',
'format': u'NA', 'format': u'NA',
'player_url': None, 'player_url': None,
'description': description 'description': description
}) })
except UnavailableVideoError:
self._downloader.trouble(u'\nERROR: unable to download video')
class PostProcessor(object): class PostProcessor(object):
"""Post Processor class. """Post Processor class.
@ -3966,6 +3972,7 @@ def gen_extractors():
EscapistIE(), EscapistIE(),
CollegeHumorIE(), CollegeHumorIE(),
XVideosIE(), XVideosIE(),
SoundcloudIE(),
GenericIE() GenericIE()
] ]