Extract full YouTube video descriptions, including special characters (Python 2.6+, with a fallback for older Python versions)

This commit is contained in:
Philipp Hagemeister 2011-07-07 12:12:20 +02:00
parent aded78d9e2
commit c6b55a8d48
1 changed file with 30 additions and 8 deletions

View File

@ -15,7 +15,6 @@ import email.utils
import gzip import gzip
import htmlentitydefs import htmlentitydefs
import httplib import httplib
import json # TODO: json for 2.5
import locale import locale
import math import math
import netrc import netrc
@ -24,20 +23,35 @@ import os.path
import re import re
import socket import socket
import string import string
import StringIO
import subprocess import subprocess
import sys import sys
import time import time
import urllib import urllib
import urllib2 import urllib2
import warnings
import zlib import zlib
try:
import json
except ImportError:
warnings.warn('No JSON support (TODO: insert trivialjson here)')
try:
import cStringIO as StringIO
except ImportError:
import StringIO
# parse_qs was moved from the cgi module to the urlparse module recently. # parse_qs was moved from the cgi module to the urlparse module recently.
try: try:
from urlparse import parse_qs from urlparse import parse_qs
except ImportError: except ImportError:
from cgi import parse_qs from cgi import parse_qs
try:
import lxml.etree
except ImportError: # Python < 2.6
pass # Handled below
std_headers = { std_headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11', 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
@ -1068,11 +1082,19 @@ class YoutubeIE(InfoExtractor):
pass pass
# description # description
video_description = 'No description available.' try:
lxml.etree
except NameError:
video_description = u'No description available.'
if self._downloader.params.get('forcedescription', False): if self._downloader.params.get('forcedescription', False):
warnings.warn(u'You are using an old Python version, install Python 2.6+ or lxml. Falling back to old video description extractor.')
mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage) mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
if mobj is not None: if mobj is not None:
video_description = mobj.group(1) video_description = mobj.group(1).decode('utf-8')
else:
html_parser = lxml.etree.HTMLParser(encoding='utf-8')
vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
# token # token
video_token = urllib.unquote_plus(video_info['token'][0]) video_token = urllib.unquote_plus(video_info['token'][0])
@ -1130,7 +1152,7 @@ class YoutubeIE(InfoExtractor):
'ext': video_extension.decode('utf-8'), 'ext': video_extension.decode('utf-8'),
'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
'thumbnail': video_thumbnail.decode('utf-8'), 'thumbnail': video_thumbnail.decode('utf-8'),
'description': video_description.decode('utf-8'), 'description': video_description,
'player_url': player_url, 'player_url': player_url,
}) })
except UnavailableVideoError, err: except UnavailableVideoError, err: