1 | """Universal feed parser |
---|
2 | |
---|
3 | Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds |
---|
4 | |
---|
5 | Visit https://code.google.com/p/feedparser/ for the latest version |
---|
6 | Visit http://packages.python.org/feedparser/ for the latest documentation |
---|
7 | |
---|
8 | Required: Python 2.4 or later |
---|
9 | Recommended: iconv_codec <http://cjkpython.i18n.org/> |
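
Example of basic usage (feedparser.parse() is the module's public entry
point; the URL below is illustrative):

    >>> import feedparser
    >>> d = feedparser.parse('http://example.com/feed.xml')
    >>> d.feed.title        # feed-level data
    >>> d.entries[0].title  # entry-level data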
"""

__version__ = "5.2.1"
__license__ = """
Copyright 2010-2015 Kurt McKee <contactme@kurtmckee.org>
Copyright 2002-2008 Mark Pilgrim
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>",
                    "Sam Ruby <http://intertwingly.net/>",
                    "Ade Oshineye <http://blog.oshineye.com/>",
                    "Martin Pool <http://sourcefrog.net/>",
                    "Kurt McKee <http://kurtmckee.org/>",
                    "Bernd Schlapsi <https://github.com/brot>",]

# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +https://code.google.com/p/feedparser/" % __version__

# HTTP "Accept" header to send to servers when downloading feeds. If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
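
# For example, an embedding application could override both at import time
# (the application name and URL below are illustrative):
#
#     import feedparser
#     feedparser.USER_AGENT = 'MyFeedReader/1.0 +http://example.com/reader/'
#     feedparser.ACCEPT_HEADER = None  # don't send an Accept header at all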

# List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.
RESOLVE_RELATIVE_URIS = 1

# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1
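
# Both switches are consulted at parse time, so an embedding application can
# disable them before calling parse(), e.g.:
#
#     feedparser.RESOLVE_RELATIVE_URIS = 0
#     feedparser.SANITIZE_HTML = 0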

# ---------- Python 3 modules (make it work if possible) ----------
try:
    import rfc822
except ImportError:
    from email import _parseaddr as rfc822

try:
    # Python 3.1 introduces bytes.maketrans and simultaneously
    # deprecates string.maketrans; use bytes.maketrans if possible
    _maketrans = bytes.maketrans
except (NameError, AttributeError):
    import string
    _maketrans = string.maketrans

# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
except ImportError:
    base64 = binascii = None
else:
    # Python 3.1 deprecates decodestring in favor of decodebytes
    _base64decode = getattr(base64, 'decodebytes', base64.decodestring)

# _s2bytes: convert a UTF-8 str to bytes if the interpreter is Python 3
# _l2bytes: convert a list of ints to bytes if the interpreter is Python 3
try:
    if bytes is str:
        # In Python 2.5 and below, bytes doesn't exist (NameError)
        # In Python 2.6 and above, bytes and str are the same type
        raise NameError
except NameError:
    # Python 2
    def _s2bytes(s):
        return s
    def _l2bytes(l):
        return ''.join(map(chr, l))
else:
    # Python 3
    def _s2bytes(s):
        return bytes(s, 'utf8')
    def _l2bytes(l):
        return bytes(l)
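
# For reference, on Python 3 these helpers behave as follows (hypothetical
# session; on Python 2 they return the equivalent byte strings unchanged):
#
#     >>> _s2bytes('abc')
#     b'abc'
#     >>> _l2bytes([104, 105])
#     b'hi'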

# If you want feedparser to allow all URL schemes, set this to ()
# List culled from Python's urlparse documentation at:
#   http://docs.python.org/library/urlparse.html
# as well as from "URI scheme" at Wikipedia:
#   https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added!
ACCEPTABLE_URI_SCHEMES = (
    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
    'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
    'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
    'wais',
    # Additional common-but-unofficial schemes
    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
    'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
)
#ACCEPTABLE_URI_SCHEMES = ()

# ---------- required modules (should come with any Python distribution) ----------
import cgi
import codecs
import copy
import datetime
import itertools
import re
import struct
import time
import types
import urllib
import urllib2
import urlparse
import warnings

from htmlentitydefs import name2codepoint, codepoint2name, entitydefs

try:
    from io import BytesIO as _StringIO
except ImportError:
    try:
        from cStringIO import StringIO as _StringIO
    except ImportError:
        from StringIO import StringIO as _StringIO

# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except ImportError:
    gzip = None
try:
    import zlib
except ImportError:
    zlib = None

# If a real XML parser is available, feedparser will attempt to use it. feedparser has
# been tested with the built-in SAX parser and libxml2. On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    from xml.sax.saxutils import escape as _xmlescape
except ImportError:
    _XML_AVAILABLE = 0
    def _xmlescape(data, entities={}):
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        for char, entity in entities:
            data = data.replace(char, entity)
        return data
else:
    try:
        xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    except xml.sax.SAXReaderNotAvailable:
        _XML_AVAILABLE = 0
    else:
        _XML_AVAILABLE = 1

# sgmllib is not available by default in Python 3; if the end user doesn't have
# it available then we'll lose illformed XML parsing and content sanitizing
try:
    import sgmllib
except ImportError:
    # This is probably Python 3, which doesn't include sgmllib anymore
    _SGML_AVAILABLE = 0

    # Mock sgmllib enough to allow subclassing later on
    class sgmllib(object):
        class SGMLParser(object):
            def goahead(self, i):
                pass
            def parse_starttag(self, i):
                pass
else:
    _SGML_AVAILABLE = 1

    # sgmllib defines a number of module-level regular expressions that are
    # insufficient for the XML parsing feedparser needs. Rather than modify
    # the variables directly in sgmllib, they're defined here using the same
    # names, and the compiled code objects of several sgmllib.SGMLParser
    # methods are copied into _BaseHTMLProcessor so that they execute in
    # feedparser's scope instead of sgmllib's scope.
    charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
    tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
    attrfind = re.compile(
        r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*'
        r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?'
    )

    # Unfortunately, these must be copied over to prevent NameError exceptions
    entityref = sgmllib.entityref
    incomplete = sgmllib.incomplete
    interesting = sgmllib.interesting
    shorttag = sgmllib.shorttag
    shorttagopen = sgmllib.shorttagopen
    starttagopen = sgmllib.starttagopen

    class _EndBracketRegEx:
        def __init__(self):
            # Overriding the built-in sgmllib.endbracket regex allows the
            # parser to find angle brackets embedded in element attributes.
            self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
        def search(self, target, index=0):
            match = self.endbracket.match(target, index)
            if match is not None:
                # Returning a new object in the calling thread's context
                # resolves a thread-safety issue.
                return EndBracketMatch(match)
            return None
    class EndBracketMatch:
        def __init__(self, match):
            self.match = match
        def start(self, n):
            return self.match.end(n)
    endbracket = _EndBracketRegEx()
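
    # The payoff of this override, on a hypothetical input: stock sgmllib
    # would stop at the '<' inside the quoted attribute value, whereas this
    # regex skips quoted strings and reports the position of the real
    # closing bracket:
    #
    #     >>> endbracket.search('a href="x<y">', 0).start(0)
    #     12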


# iconv_codec provides support for more character encodings.
# It's available from http://cjkpython.i18n.org/
try:
    import iconv_codec
except ImportError:
    pass

# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
except ImportError:
    chardet = None

# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass

SUPPORTED_VERSIONS = {'': u'unknown',
                      'rss090': u'RSS 0.90',
                      'rss091n': u'RSS 0.91 (Netscape)',
                      'rss091u': u'RSS 0.91 (Userland)',
                      'rss092': u'RSS 0.92',
                      'rss093': u'RSS 0.93',
                      'rss094': u'RSS 0.94',
                      'rss20': u'RSS 2.0',
                      'rss10': u'RSS 1.0',
                      'rss': u'RSS (unknown version)',
                      'atom01': u'Atom 0.1',
                      'atom02': u'Atom 0.2',
                      'atom03': u'Atom 0.3',
                      'atom10': u'Atom 1.0',
                      'atom': u'Atom (unknown version)',
                      'cdf': u'CDF',
                      }

class FeedParserDict(dict):
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['summary', 'subtitle'],
              'description_detail': ['summary_detail', 'subtitle_detail'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
    def __getitem__(self, key):
        '''
        :return: A :class:`FeedParserDict`.
        '''
        if key == 'category':
            try:
                return dict.__getitem__(self, 'tags')[0]['term']
            except IndexError:
                raise KeyError, "object doesn't have key 'category'"
        elif key == 'enclosures':
            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
            return [norel(link) for link in dict.__getitem__(self, 'links') if link['rel']==u'enclosure']
        elif key == 'license':
            for link in dict.__getitem__(self, 'links'):
                if link['rel']==u'license' and 'href' in link:
                    return link['href']
        elif key == 'updated':
            # Temporarily help developers out by keeping the old
            # broken behavior that was reported in issue 310.
            # This fix was proposed in issue 328.
            if not dict.__contains__(self, 'updated') and \
                    dict.__contains__(self, 'published'):
                warnings.warn("To avoid breaking existing software while "
                              "fixing issue 310, a temporary mapping has been created "
                              "from `updated` to `published` if `updated` doesn't "
                              "exist. This fallback will be removed in a future version "
                              "of feedparser.", DeprecationWarning)
                return dict.__getitem__(self, 'published')
            return dict.__getitem__(self, 'updated')
        elif key == 'updated_parsed':
            if not dict.__contains__(self, 'updated_parsed') and \
                    dict.__contains__(self, 'published_parsed'):
                warnings.warn("To avoid breaking existing software while "
                              "fixing issue 310, a temporary mapping has been created "
                              "from `updated_parsed` to `published_parsed` if "
                              "`updated_parsed` doesn't exist. This fallback will be "
                              "removed in a future version of feedparser.",
                              DeprecationWarning)
                return dict.__getitem__(self, 'published_parsed')
            return dict.__getitem__(self, 'updated_parsed')
        else:
            realkey = self.keymap.get(key, key)
            if isinstance(realkey, list):
                for k in realkey:
                    if dict.__contains__(self, k):
                        return dict.__getitem__(self, k)
            elif dict.__contains__(self, realkey):
                return dict.__getitem__(self, realkey)
        return dict.__getitem__(self, key)

    def __contains__(self, key):
        if key in ('updated', 'updated_parsed'):
            # Temporarily help developers out by keeping the old
            # broken behavior that was reported in issue 310.
            # This fix was proposed in issue 328.
            return dict.__contains__(self, key)
        try:
            self.__getitem__(key)
        except KeyError:
            return False
        else:
            return True

    has_key = __contains__

    def get(self, key, default=None):
        '''
        :return: A :class:`FeedParserDict`.
        '''
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def __setitem__(self, key, value):
        key = self.keymap.get(key, key)
        if isinstance(key, list):
            key = key[0]
        return dict.__setitem__(self, key, value)

    def setdefault(self, key, value):
        if key not in self:
            self[key] = value
            return value
        return self[key]

    def __getattr__(self, key):
        # __getattribute__() is called first; this will be called
        # only if an attribute was not already found
        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError, "object has no attribute '%s'" % key

    def __hash__(self):
        return id(self)

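# Note on FeedParserDict: the keymap above aliases legacy key names to their
# modern equivalents on both read and write. A hypothetical session:
#
#     >>> d = FeedParserDict()
#     >>> d['channel'] = FeedParserDict(title=u'Example')  # stored under 'feed'
#     >>> d.feed.title  # keys are also reachable as attributes
#     u'Example'
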
_cp1252 = {
    128: unichr(8364), # euro sign
    130: unichr(8218), # single low-9 quotation mark
    131: unichr( 402), # latin small letter f with hook
    132: unichr(8222), # double low-9 quotation mark
    133: unichr(8230), # horizontal ellipsis
    134: unichr(8224), # dagger
    135: unichr(8225), # double dagger
    136: unichr( 710), # modifier letter circumflex accent
    137: unichr(8240), # per mille sign
    138: unichr( 352), # latin capital letter s with caron
    139: unichr(8249), # single left-pointing angle quotation mark
    140: unichr( 338), # latin capital ligature oe
    142: unichr( 381), # latin capital letter z with caron
    145: unichr(8216), # left single quotation mark
    146: unichr(8217), # right single quotation mark
    147: unichr(8220), # left double quotation mark
    148: unichr(8221), # right double quotation mark
    149: unichr(8226), # bullet
    150: unichr(8211), # en dash
    151: unichr(8212), # em dash
    152: unichr( 732), # small tilde
    153: unichr(8482), # trade mark sign
    154: unichr( 353), # latin small letter s with caron
    155: unichr(8250), # single right-pointing angle quotation mark
    156: unichr( 339), # latin small ligature oe
    158: unichr( 382), # latin small letter z with caron
    159: unichr( 376), # latin capital letter y with diaeresis
}
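
# For example, translating a unicode string that smuggles cp1252 smart quotes
# into the 0x80-0x9F control range (hypothetical input):
#
#     >>> u'\x93cool\x94'.translate(_cp1252)
#     u'\u201ccool\u201d'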

_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    if not isinstance(uri, unicode):
        uri = uri.decode('utf-8', 'ignore')
    try:
        uri = urlparse.urljoin(base, uri)
    except ValueError:
        uri = u''
    if not isinstance(uri, unicode):
        return uri.decode('utf-8', 'ignore')
    return uri
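
# Two illustrative calls (hypothetical inputs):
#
#     >>> _urljoin(u'http://example.com/feed/', u'../img/logo.png')
#     u'http://example.com/img/logo.png'
#     >>> _urljoin(u'http://example.com/', u'http:////example.com/a')
#     u'http://example.com/a'
#
# The second works because _urifixer first collapses the extra slashes
# that sometimes follow the scheme.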

class _FeedParserMixin:
    namespaces = {
        '': '',
        'http://backend.userland.com/rss': '',
        'http://blogs.law.harvard.edu/tech/rss': '',
        'http://purl.org/rss/1.0/': '',
        'http://my.netscape.com/rdf/simple/0.9/': '',
        'http://example.com/newformat#': '',
        'http://example.com/necho': '',
        'http://purl.org/echo/': '',
        'uri/of/echo/namespace#': '',
        'http://purl.org/pie/': '',
        'http://purl.org/atom/ns#': '',
        'http://www.w3.org/2005/Atom': '',
        'http://purl.org/rss/1.0/modules/rss091#': '',

        'http://webns.net/mvcb/': 'admin',
        'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
        'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
        'http://media.tangent.org/rss/1.0/': 'audio',
        'http://backend.userland.com/blogChannelModule': 'blogChannel',
        'http://web.resource.org/cc/': 'cc',
        'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
        'http://purl.org/rss/1.0/modules/company': 'co',
        'http://purl.org/rss/1.0/modules/content/': 'content',
        'http://my.theinfo.org/changed/1.0/rss/': 'cp',
        'http://purl.org/dc/elements/1.1/': 'dc',
        'http://purl.org/dc/terms/': 'dcterms',
        'http://purl.org/rss/1.0/modules/email/': 'email',
        'http://purl.org/rss/1.0/modules/event/': 'ev',
        'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
        'http://freshmeat.net/rss/fm/': 'fm',
        'http://xmlns.com/foaf/0.1/': 'foaf',
        'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
        'http://www.georss.org/georss': 'georss',
        'http://www.opengis.net/gml': 'gml',
        'http://postneo.com/icbm/': 'icbm',
        'http://purl.org/rss/1.0/modules/image/': 'image',
        'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
        'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
        'http://purl.org/rss/1.0/modules/link/': 'l',
        'http://search.yahoo.com/mrss': 'media',
        # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
        'http://search.yahoo.com/mrss/': 'media',
        'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
        'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
        'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
        'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
        'http://purl.org/rss/1.0/modules/reference/': 'ref',
        'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
        'http://purl.org/rss/1.0/modules/search/': 'search',
        'http://purl.org/rss/1.0/modules/slash/': 'slash',
        'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
        'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
        'http://hacks.benhammersley.com/rss/streaming/': 'str',
        'http://purl.org/rss/1.0/modules/subscription/': 'sub',
        'http://purl.org/rss/1.0/modules/syndication/': 'sy',
        'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
        'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
        'http://purl.org/rss/1.0/modules/threading/': 'thr',
        'http://purl.org/rss/1.0/modules/textinput/': 'ti',
        'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
        'http://wellformedweb.org/commentAPI/': 'wfw',
        'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
        'http://www.w3.org/1999/xhtml': 'xhtml',
        'http://www.w3.org/1999/xlink': 'xlink',
        'http://www.w3.org/XML/1998/namespace': 'xml',
        'http://podlove.org/simple-chapters': 'psc',
    }
    _matchnamespaces = {}

    can_be_relative_uri = set(['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo'])
    can_contain_relative_uris = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
    can_contain_dangerous_markup = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
    html_types = [u'text/html', u'application/xhtml+xml']

    def __init__(self, baseuri=None, baselang=None, encoding=u'utf-8'):
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = u'' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0

        # georss
        self.ingeometry = 0

        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or u''
        self.lang = baselang or None
        self.svgOK = 0
        self.title_depth = -1
        self.depth = 0
        # psc_chapters_flag prevents multiple psc_chapters from being
        # captured in a single entry or item. The transition states are
        # None -> True -> False. psc_chapter elements will only be
        # captured while it is True.
        self.psc_chapters_flag = None
        if baselang:
            self.feeddata['language'] = baselang.replace('_','-')

        # A map of the following form:
        #     {
        #         object_that_value_is_set_on: {
        #             property_name: depth_of_node_property_was_extracted_from,
        #             other_property: depth_of_node_property_was_extracted_from,
        #         },
        #     }
        self.property_depth_map = {}

    def _normalize_attributes(self, kv):
        k = kv[0].lower()
        v = k in ('rel', 'type') and kv[1].lower() or kv[1]
        # the sgml parser doesn't handle entities in attributes, nor
        # does it pass the attribute values through as unicode, while
        # strict xml parsers do -- account for this difference
        if isinstance(self, _LooseFeedParser):
            v = v.replace('&amp;', '&')
            if not isinstance(v, unicode):
                v = v.decode('utf-8')
        return (k, v)

    def unknown_starttag(self, tag, attrs):
        # increment depth counter
        self.depth += 1

        # normalize attrs
        attrs = map(self._normalize_attributes, attrs)

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        if not isinstance(baseuri, unicode):
            baseuri = baseuri.decode(self.encoding, 'ignore')
        # ensure that self.baseuri is always an absolute URI that
        # uses a whitelisted URI scheme (e.g. not `javascript:`)
        if self.baseuri:
            self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
        else:
            self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang.replace('_','-')
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            if tag in ('xhtml:div', 'div'):
                return # typepad does this 10/2007
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = u'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
            if tag.find(':') <> -1:
                prefix, tag = tag.split(':', 1)
                namespace = self.namespacesInUse.get(prefix, '')
                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                    attrs.append(('xmlns',namespace))
                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
                    attrs.append(('xmlns',namespace))
            if tag == 'svg':
                self.svgOK += 1
            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)

        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
            unknown_tag = prefix + suffix
            if len(attrsD) == 0:
                # No attributes so merge it into the enclosing dictionary
                return self.push(unknown_tag, 1)
            else:
                # Has attributes so create it in its own dictionary
                context = self._getContext()
                context[unknown_tag] = attrsD

    def unknown_endtag(self, tag):
        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'
        if suffix == 'svg' and self.svgOK:
            self.svgOK -= 1

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            if self.svgOK:
                raise AttributeError()
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            # element declared itself as escaped markup, but it isn't really
            if tag in ('xhtml:div', 'div'):
                return # typepad does this 10/2007
            self.contentparams['type'] = u'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]

        self.depth -= 1

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack:
            return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack:
            return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities:
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
        else:
            try:
                name2codepoint[ref]
            except KeyError:
                text = '&%s;' % ref
            else:
                text = unichr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack:
            return
        if escape and self.contentparams.get('type') == u'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # CDATA block began but didn't finish
                k = len(self.rawdata)
                return k
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            if k >= 0:
                return k+1
            else:
                # We have an incomplete declaration.
                return k

    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text' or contentType == 'plain':
            contentType = u'text/plain'
        elif contentType == 'html':
            contentType = u'text/html'
        elif contentType == 'xhtml':
            contentType = u'application/xhtml+xml'
        return contentType

    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if not self.version:
            if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'):
                self.version = u'rss090'
            elif loweruri == 'http://purl.org/rss/1.0/':
                self.version = u'rss10'
            elif loweruri == 'http://www.w3.org/2005/atom':
                self.version = u'atom10'
        if loweruri.find(u'backend.userland.com/rss') <> -1:
            # match any backend.userland.com namespace
            uri = u'http://backend.userland.com/rss'
            loweruri = uri
        if loweruri in self._matchnamespaces:
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri or u'', uri)

    def decodeEntities(self, element, data):
        return data

    def strattrs(self, attrs):
        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])

    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])

    def pop(self, element, stripWhitespace=1):
        if not self.elementstack:
            return
        if self.elementstack[-1][0] != element:
            return

        element, expectingText, pieces = self.elementstack.pop()

        if self.version == u'atom10' and self.contentparams.get('type', u'text') == u'application/xhtml+xml':
            # remove enclosing child element, but only if it is a <div> and
            # only if all the remaining content is nested underneath it.
            # This means that the divs would be retained in the following:
            #     <div>foo</div><div>bar</div>
            while pieces and len(pieces)>1 and not pieces[-1].strip():
                del pieces[-1]
            while pieces and len(pieces)>1 and not pieces[0].strip():
                del pieces[0]
            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
                depth = 0
                for piece in pieces[:-1]:
                    if piece.startswith('</'):
                        depth -= 1
                        if depth == 0:
                            break
                    elif piece.startswith('<') and not piece.endswith('/>'):
                        depth += 1
                else:
                    pieces = pieces[1:-1]

        # Ensure each piece is a str for Python 3
        for (i, v) in enumerate(pieces):
            if not isinstance(v, unicode):
                pieces[i] = v.decode('utf-8')

        output = u''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText:
            return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = _base64decode(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass
            except TypeError:
                # In Python 3, base64 takes and outputs bytes, not str
                # This may not be the most correct way to accomplish this
                output = _base64decode(output.encode('utf-8')).decode('utf-8')

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            # do not resolve guid elements with isPermalink="false"
            if not element == 'id' or self.guidislink:
                output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # some feed formats require consumers to guess
        # whether the content is html or plain text
        if not self.version.startswith(u'atom') and self.contentparams.get('type') == u'text/plain':
            if self.lookslikehtml(output):
                self.contentparams['type'] = u'text/html'

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        is_htmlish = self.mapContentType(self.contentparams.get('type', u'text/html')) in self.html_types
        # resolve relative URIs within embedded markup
        if is_htmlish and RESOLVE_RELATIVE_URIS:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html'))

        # sanitize embedded markup
        if is_htmlish and SANITIZE_HTML:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', u'text/html'))

        if self.encoding and not isinstance(output, unicode):
            output = output.decode(self.encoding, 'ignore')

        # address common error where people take data that is already
        # utf-8, presume that it is iso-8859-1, and re-encode it.
        if self.encoding in (u'utf-8', u'utf-8_INVALID_PYTHON_3') and isinstance(output, unicode):
            try:
                output = output.encode('iso-8859-1').decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
                pass

        # map win-1252 extensions to the proper code points
        if isinstance(output, unicode):
            output = output.translate(_cp1252)

        # categories/tags/keywords/whatever are handled in _end_category or _end_tags or _end_itunes_keywords
        if element in ('category', 'tags', 'itunes_keywords'):
            return output

        if element == 'title' and -1 < self.title_depth <= self.depth:
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                if not self.inimage:
                    # query variables in urls in link elements are improperly
                    # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
                    # unhandled character references. fix this special case.
                    output = output.replace('&amp;', '&')
                    output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                    self.entries[-1][element] = output
                    if output:
                        self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element)
                if old_value_depth is None or self.depth <= old_value_depth:
                    self.property_depth_map[self.entries[-1]][element] = self.depth
                    self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                # fix query variables; see above for the explanation
                output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                context[element] = output
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output

    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        if self.lang:
            self.lang=self.lang.replace('_','-')
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)

    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value

    # a number of elements in a number of RSS variants are nominally plain
    # text, but this is routinely ignored. This is an attempt to detect
    # the most common cases. As false positives often result in silent
    # data loss, this function errs on the conservative side.
    @staticmethod
    def lookslikehtml(s):
        # must have a close tag or an entity reference to qualify
        if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)):
            return

        # all tags must be in a restricted subset of valid HTML tags
        if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
                  re.findall(r'</?(\w+)',s)):
            return

        # all entities must have been defined as valid HTML entities
        if filter(lambda e: e not in entitydefs.keys(), re.findall(r'&(\w+);', s)):
            return

        return 1
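
    # For instance (hypothetical inputs; _HTMLSanitizer is referenced above
    # and defined later in this file):
    #
    #     >>> _FeedParserMixin.lookslikehtml(u'Hello, <b>world</b>!')
    #     1
    #     >>> _FeedParserMixin.lookslikehtml(u'2 < 3')  # no close tag or entity -> None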

    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos <> -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))

    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith(u'text/'):
            return 0
        if self.contentparams['type'].endswith(u'+xml'):
            return 0
        if self.contentparams['type'].endswith(u'/xml'):
            return 0
        return 1

    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try:
                del attrsD['url']
            except KeyError:
                pass
            try:
                del attrsD['uri']
            except KeyError:
                pass
            attrsD['href'] = href
        return attrsD

    def _save(self, key, value, overwrite=False):
        context = self._getContext()
        if overwrite:
            context[key] = value
        else:
            context.setdefault(key, value)

    def _start_rss(self, attrsD):
        versionmap = {'0.91': u'rss091u',
                      '0.92': u'rss092',
                      '0.93': u'rss093',
                      '0.94': u'rss094'}
        # If we're here then this is an RSS feed.
        # If we don't have a version or have a version that starts with something
        # other than RSS then there's been a mistake. Correct it.
        if not self.version or not self.version.startswith(u'rss'):
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = u'rss20'
            else:
                self.version = u'rss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)

    def _cdf_common(self, attrsD):
        if 'lastmod' in attrsD:
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if 'href' in attrsD:
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': u'atom01',
                      '0.2': u'atom02',
                      '0.3': u'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = u'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel

    def _start_image(self, attrsD):
        context = self._getContext()
        if not self.inentry:
            context.setdefault('image', FeedParserDict())
        self.inimage = 1
        self.title_depth = -1
        self.push('image', 0)

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
        self.intextinput = 1
        self.title_depth = -1
        self.push('textinput', 0)
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput

    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
        # Append a new FeedParserDict when expecting an author
        context = self._getContext()
        context.setdefault('authors', [])
        context['authors'].append(FeedParserDict())
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')

---|
1177 | def _start_contributor(self, attrsD): |
---|
1178 | self.incontributor = 1 |
---|
1179 | context = self._getContext() |
---|
1180 | context.setdefault('contributors', []) |
---|
1181 | context['contributors'].append(FeedParserDict()) |
---|
1182 | self.push('contributor', 0) |
---|
1183 | |
---|
1184 | def _end_contributor(self): |
---|
1185 | self.pop('contributor') |
---|
1186 | self.incontributor = 0 |
---|
1187 | |
---|
1188 | def _start_dc_contributor(self, attrsD): |
---|
1189 | self.incontributor = 1 |
---|
1190 | context = self._getContext() |
---|
1191 | context.setdefault('contributors', []) |
---|
1192 | context['contributors'].append(FeedParserDict()) |
---|
1193 | self.push('name', 0) |
---|
1194 | |
---|
1195 | def _end_dc_contributor(self): |
---|
1196 | self._end_name() |
---|
1197 | self.incontributor = 0 |
---|
1198 | |
---|
1199 | def _start_name(self, attrsD): |
---|
1200 | self.push('name', 0) |
---|
1201 | _start_itunes_name = _start_name |
---|
1202 | |
---|
1203 | def _end_name(self): |
---|
1204 | value = self.pop('name') |
---|
1205 | if self.inpublisher: |
---|
1206 | self._save_author('name', value, 'publisher') |
---|
1207 | elif self.inauthor: |
---|
1208 | self._save_author('name', value) |
---|
1209 | elif self.incontributor: |
---|
1210 | self._save_contributor('name', value) |
---|
1211 | elif self.intextinput: |
---|
1212 | context = self._getContext() |
---|
1213 | context['name'] = value |
---|
1214 | _end_itunes_name = _end_name |
---|
1215 | |
---|
1216 | def _start_width(self, attrsD): |
---|
1217 | self.push('width', 0) |
---|
1218 | |
---|
1219 | def _end_width(self): |
---|
1220 | value = self.pop('width') |
---|
1221 | try: |
---|
1222 | value = int(value) |
---|
1223 | except ValueError: |
---|
1224 | value = 0 |
---|
1225 | if self.inimage: |
---|
1226 | context = self._getContext() |
---|
1227 | context['width'] = value |
---|
1228 | |
---|
1229 | def _start_height(self, attrsD): |
---|
1230 | self.push('height', 0) |
---|
1231 | |
---|
1232 | def _end_height(self): |
---|
1233 | value = self.pop('height') |
---|
1234 | try: |
---|
1235 | value = int(value) |
---|
1236 | except ValueError: |
---|
1237 | value = 0 |
---|
1238 | if self.inimage: |
---|
1239 | context = self._getContext() |
---|
1240 | context['height'] = value |
---|
1241 | |
---|
1242 | def _start_url(self, attrsD): |
---|
1243 | self.push('href', 1) |
---|
1244 | _start_homepage = _start_url |
---|
1245 | _start_uri = _start_url |
---|
1246 | |
---|
1247 | def _end_url(self): |
---|
1248 | value = self.pop('href') |
---|
1249 | if self.inauthor: |
---|
1250 | self._save_author('href', value) |
---|
1251 | elif self.incontributor: |
---|
1252 | self._save_contributor('href', value) |
---|
1253 | _end_homepage = _end_url |
---|
1254 | _end_uri = _end_url |
---|
1255 | |
---|
1256 | def _start_email(self, attrsD): |
---|
1257 | self.push('email', 0) |
---|
1258 | _start_itunes_email = _start_email |
---|
1259 | |
---|
1260 | def _end_email(self): |
---|
1261 | value = self.pop('email') |
---|
1262 | if self.inpublisher: |
---|
1263 | self._save_author('email', value, 'publisher') |
---|
1264 | elif self.inauthor: |
---|
1265 | self._save_author('email', value) |
---|
1266 | elif self.incontributor: |
---|
1267 | self._save_contributor('email', value) |
---|
1268 | _end_itunes_email = _end_email |
---|
1269 | |
---|
1270 | def _getContext(self): |
---|
1271 | if self.insource: |
---|
1272 | context = self.sourcedata |
---|
1273 | elif self.inimage and 'image' in self.feeddata: |
---|
1274 | context = self.feeddata['image'] |
---|
1275 | elif self.intextinput: |
---|
1276 | context = self.feeddata['textinput'] |
---|
1277 | elif self.inentry: |
---|
1278 | context = self.entries[-1] |
---|
1279 | else: |
---|
1280 | context = self.feeddata |
---|
1281 | return context |
---|
1282 | |
---|
1283 | def _save_author(self, key, value, prefix='author'): |
---|
1284 | context = self._getContext() |
---|
1285 | context.setdefault(prefix + '_detail', FeedParserDict()) |
---|
1286 | context[prefix + '_detail'][key] = value |
---|
1287 | self._sync_author_detail() |
---|
1288 | context.setdefault('authors', [FeedParserDict()]) |
---|
1289 | context['authors'][-1][key] = value |
---|
1290 | |
---|
1291 | def _save_contributor(self, key, value): |
---|
1292 | context = self._getContext() |
---|
1293 | context.setdefault('contributors', [FeedParserDict()]) |
---|
1294 | context['contributors'][-1][key] = value |
---|
1295 | |
---|
1296 | def _sync_author_detail(self, key='author'): |
---|
1297 | context = self._getContext() |
---|
1298 | detail = context.get('%ss' % key, [FeedParserDict()])[-1] |
---|
1299 | if detail: |
---|
1300 | name = detail.get('name') |
---|
1301 | email = detail.get('email') |
---|
1302 | if name and email: |
---|
1303 | context[key] = u'%s (%s)' % (name, email) |
---|
1304 | elif name: |
---|
1305 | context[key] = name |
---|
1306 | elif email: |
---|
1307 | context[key] = email |
---|
1308 | else: |
---|
1309 | author, email = context.get(key), None |
---|
1310 | if not author: |
---|
1311 | return |
---|
            emailmatch = re.search(ur'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
            if emailmatch:
                email = emailmatch.group(0)
                # probably a better way to do the following, but it passes all the tests
                author = author.replace(email, u'')
                author = author.replace(u'()', u'')
                author = author.replace(u'<>', u'')
                author = author.strip()
                if author and (author[0] == u'('):
                    author = author[1:]
                if author and (author[-1] == u')'):
                    author = author[:-1]
                author = author.strip()
            if author or email:
                context.setdefault('%s_detail' % key, detail)
            if author:
                detail['name'] = author
            if email:
                detail['email'] = email
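
    # Illustration of the fallback branch above (not part of the parsing logic):
    # given an RSS-style author string, the e-mail regex pulls out the address
    # and the remaining text, minus any leftover '()' or '<>', becomes the name:
    #
    #     context['author'] = u'Mark Pilgrim (mark@example.org)'
    #     # after _sync_author_detail():
    #     #   detail['name']  == u'Mark Pilgrim'
    #     #   detail['email'] == u'mark@example.org'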

    def _start_subtitle(self, attrsD):
        self.pushContent('subtitle', attrsD, u'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle

    def _end_subtitle(self):
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle

    def _start_rights(self, attrsD):
        self.pushContent('rights', attrsD, u'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights

    def _end_rights(self):
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights

    def _start_item(self, attrsD):
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        self.title_depth = -1
        self.psc_chapters_flag = None
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item

    def _end_item(self):
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item

    def _start_dc_language(self, attrsD):
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        self.lang = self.pop('language')
    _end_language = _end_dc_language

    def _start_dc_publisher(self, attrsD):
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher

    def _start_dcterms_valid(self, attrsD):
        self.push('validity', 1)

    def _end_dcterms_valid(self):
        for validity_detail in self.pop('validity').split(';'):
            if '=' in validity_detail:
                key, value = validity_detail.split('=', 1)
                if key == 'start':
                    self._save('validity_start', value, overwrite=True)
                    self._save('validity_start_parsed', _parse_date(value), overwrite=True)
                elif key == 'end':
                    self._save('validity_end', value, overwrite=True)
                    self._save('validity_end_parsed', _parse_date(value), overwrite=True)
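
    # For example, a <dcterms:valid> value shaped like
    #     start=2002-10-13T09:00+01:00;end=2002-10-17T17:00+01:00
    # yields 'validity_start' and 'validity_end' plus their '_parsed' counterparts.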

    def _start_published(self, attrsD):
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published
    _start_pubdate = _start_published

    def _end_published(self):
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_issued = _end_published
    _end_issued = _end_published
    _end_pubdate = _end_published

    def _start_updated(self, attrsD):
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_dc_date = _start_updated
    _start_lastbuilddate = _start_updated

    def _end_updated(self):
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value, overwrite=True)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_dc_date = _end_updated
    _end_lastbuilddate = _end_updated

    def _start_created(self, attrsD):
        self.push('created', 1)
    _start_dcterms_created = _start_created

    def _end_created(self):
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_created = _end_created

    def _start_expirationdate(self, attrsD):
        self.push('expired', 1)

    def _end_expirationdate(self):
        self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)

    # geospatial location, or "where", from georss.org

    def _start_georssgeom(self, attrsD):
        self.push('geometry', 0)
        context = self._getContext()
        context['where'] = FeedParserDict()

    _start_georss_point = _start_georssgeom
    _start_georss_line = _start_georssgeom
    _start_georss_polygon = _start_georssgeom
    _start_georss_box = _start_georssgeom

    def _save_where(self, geometry):
        context = self._getContext()
        context['where'].update(geometry)

    def _end_georss_point(self):
        geometry = _parse_georss_point(self.pop('geometry'))
        if geometry:
            self._save_where(geometry)

    def _end_georss_line(self):
        geometry = _parse_georss_line(self.pop('geometry'))
        if geometry:
            self._save_where(geometry)

    def _end_georss_polygon(self):
        this = self.pop('geometry')
        geometry = _parse_georss_polygon(this)
        if geometry:
            self._save_where(geometry)

    def _end_georss_box(self):
        geometry = _parse_georss_box(self.pop('geometry'))
        if geometry:
            self._save_where(geometry)

    def _start_where(self, attrsD):
        self.push('where', 0)
        context = self._getContext()
        context['where'] = FeedParserDict()
    _start_georss_where = _start_where

    def _parse_srs_attrs(self, attrsD):
        srsName = attrsD.get('srsname')
        try:
            srsDimension = int(attrsD.get('srsdimension', '2'))
        except ValueError:
            srsDimension = 2
        context = self._getContext()
        context['where']['srsName'] = srsName
        context['where']['srsDimension'] = srsDimension

    def _start_gml_point(self, attrsD):
        self._parse_srs_attrs(attrsD)
        self.ingeometry = 1
        self.push('geometry', 0)

    def _start_gml_linestring(self, attrsD):
        self._parse_srs_attrs(attrsD)
        self.ingeometry = 'linestring'
        self.push('geometry', 0)

    def _start_gml_polygon(self, attrsD):
        self._parse_srs_attrs(attrsD)
        self.push('geometry', 0)

    def _start_gml_exterior(self, attrsD):
        self.push('geometry', 0)

    def _start_gml_linearring(self, attrsD):
        self.ingeometry = 'polygon'
        self.push('geometry', 0)

    def _start_gml_pos(self, attrsD):
        self.push('pos', 0)

    def _end_gml_pos(self):
        this = self.pop('pos')
        context = self._getContext()
        srsName = context['where'].get('srsName')
        srsDimension = context['where'].get('srsDimension', 2)
        swap = True
        if srsName and "EPSG" in srsName:
            epsg = int(srsName.split(":")[-1])
            swap = bool(epsg in _geogCS)
        geometry = _parse_georss_point(this, swap=swap, dims=srsDimension)
        if geometry:
            self._save_where(geometry)
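
    # Note on the swap flag above: positions in geographic coordinate systems
    # (EPSG codes listed in _geogCS) are written "lat lon" and get swapped into
    # (lon, lat) order, while projected systems are already (x, y) and are left
    # alone. E.g. <gml:pos>45.256 -71.92</gml:pos> with a geographic srsName
    # ends up as a point with coordinates (-71.92, 45.256).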

    def _start_gml_poslist(self, attrsD):
        self.push('pos', 0)

    def _end_gml_poslist(self):
        this = self.pop('pos')
        context = self._getContext()
        srsName = context['where'].get('srsName')
        srsDimension = context['where'].get('srsDimension', 2)
        swap = True
        if srsName and "EPSG" in srsName:
            epsg = int(srsName.split(":")[-1])
            swap = bool(epsg in _geogCS)
        geometry = _parse_poslist(
            this, self.ingeometry, swap=swap, dims=srsDimension)
        if geometry:
            self._save_where(geometry)

    def _end_geom(self):
        self.ingeometry = 0
        self.pop('geometry')
    _end_gml_point = _end_geom
    _end_gml_linestring = _end_geom
    _end_gml_linearring = _end_geom
    _end_gml_exterior = _end_geom
    _end_gml_polygon = _end_geom

    def _end_where(self):
        self.pop('where')
    _end_georss_where = _end_where

    # end geospatial

    def _start_cc_license(self, attrsD):
        context = self._getContext()
        value = self._getAttribute(attrsD, 'rdf:resource')
        attrsD = FeedParserDict()
        attrsD['rel'] = u'license'
        if value:
            attrsD['href'] = value
        context.setdefault('links', []).append(attrsD)

    def _start_creativecommons_license(self, attrsD):
        self.push('license', 1)
    _start_creativeCommons_license = _start_creativecommons_license

    def _end_creativecommons_license(self):
        value = self.pop('license')
        context = self._getContext()
        attrsD = FeedParserDict()
        attrsD['rel'] = u'license'
        if value:
            attrsD['href'] = value
        context.setdefault('links', []).append(attrsD)
        del context['license']
    _end_creativeCommons_license = _end_creativecommons_license

    def _addTag(self, term, scheme, label):
        context = self._getContext()
        tags = context.setdefault('tags', [])
        if (not term) and (not scheme) and (not label):
            return
        value = FeedParserDict(term=term, scheme=scheme, label=label)
        if value not in tags:
            tags.append(value)

    def _start_tags(self, attrsD):
        # This is a completely-made up element. Its semantics are determined
        # only by a single feed that precipitated bug report 392 on Google Code.
        # In short, this is junk code.
        self.push('tags', 1)

    def _end_tags(self):
        for term in self.pop('tags').split(','):
            self._addTag(term.strip(), None, None)

    def _start_category(self, attrsD):
        term = attrsD.get('term')
        scheme = attrsD.get('scheme', attrsD.get('domain'))
        label = attrsD.get('label')
        self._addTag(term, scheme, label)
        self.push('category', 1)
    _start_dc_subject = _start_category
    _start_keywords = _start_category

    def _start_media_category(self, attrsD):
        attrsD.setdefault('scheme', u'http://search.yahoo.com/mrss/category_schema')
        self._start_category(attrsD)

    def _end_itunes_keywords(self):
        for term in self.pop('itunes_keywords').split(','):
            if term.strip():
                self._addTag(term.strip(), u'http://www.itunes.com/', None)

    def _end_media_keywords(self):
        for term in self.pop('media_keywords').split(','):
            if term.strip():
                self._addTag(term.strip(), None, None)

    def _start_itunes_category(self, attrsD):
        self._addTag(attrsD.get('text'), u'http://www.itunes.com/', None)
        self.push('category', 1)

    def _end_category(self):
        value = self.pop('category')
        if not value:
            return
        context = self._getContext()
        tags = context['tags']
        if value and len(tags) and not tags[-1]['term']:
            tags[-1]['term'] = value
        else:
            self._addTag(value, None, None)
    _end_dc_subject = _end_category
    _end_keywords = _end_category
    _end_itunes_category = _end_category
    _end_media_category = _end_category

    def _start_cloud(self, attrsD):
        self._getContext()['cloud'] = FeedParserDict(attrsD)

    def _start_link(self, attrsD):
        attrsD.setdefault('rel', u'alternate')
        if attrsD['rel'] == u'self':
            attrsD.setdefault('type', u'application/atom+xml')
        else:
            attrsD.setdefault('type', u'text/html')
        context = self._getContext()
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if 'href' in attrsD:
            attrsD['href'] = self.resolveURI(attrsD['href'])
        expectingText = self.infeed or self.inentry or self.insource
        context.setdefault('links', [])
        if not (self.inentry and self.inimage):
            context['links'].append(FeedParserDict(attrsD))
        if 'href' in attrsD:
            expectingText = 0
            if (attrsD.get('rel') == u'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
                context['link'] = attrsD['href']
        else:
            self.push('link', expectingText)
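
    # For example, <link rel="self" href="..."/> defaults its type to
    # 'application/atom+xml', while a bare <link href="..."/> is treated as
    # rel='alternate' type='text/html' and, because text/html is in
    # self.html_types, also becomes the context's main 'link'.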

    def _end_link(self):
        value = self.pop('link')

    def _start_guid(self, attrsD):
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)
    _start_id = _start_guid

    def _end_guid(self):
        value = self.pop('id')
        self._save('guidislink', self.guidislink and 'link' not in self._getContext())
        if self.guidislink:
            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
            # and only if the item doesn't already have a link element
            self._save('link', value)
    _end_id = _end_guid

    def _start_title(self, attrsD):
        if self.svgOK:
            return self.unknown_starttag('title', attrsD.items())
        self.pushContent('title', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
    _start_dc_title = _start_title
    _start_media_title = _start_title

    def _end_title(self):
        if self.svgOK:
            return
        value = self.popContent('title')
        if not value:
            return
        self.title_depth = self.depth
    _end_dc_title = _end_title

    def _end_media_title(self):
        title_depth = self.title_depth
        self._end_title()
        self.title_depth = title_depth

    def _start_description(self, attrsD):
        context = self._getContext()
        if 'summary' in context:
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource)
    _start_dc_description = _start_description
    _start_media_description = _start_description

    def _start_abstract(self, attrsD):
        self.pushContent('description', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)

    def _end_description(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            value = self.popContent('description')
        self._summaryKey = None
    _end_abstract = _end_description
    _end_dc_description = _end_description
    _end_media_description = _end_description

    def _start_info(self, attrsD):
        self.pushContent('info', attrsD, u'text/plain', 1)
    _start_feedburner_browserfriendly = _start_info

    def _end_info(self):
        self.popContent('info')
    _end_feedburner_browserfriendly = _end_info

    def _start_generator(self, attrsD):
        if attrsD:
            attrsD = self._itsAnHrefDamnIt(attrsD)
            if 'href' in attrsD:
                attrsD['href'] = self.resolveURI(attrsD['href'])
        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
        self.push('generator', 1)

    def _end_generator(self):
        value = self.pop('generator')
        context = self._getContext()
        if 'generator_detail' in context:
            context['generator_detail']['name'] = value

    def _start_admin_generatoragent(self, attrsD):
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self._getContext()['generator_detail'] = FeedParserDict({'href': value})

    def _start_admin_errorreportsto(self, attrsD):
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')

    def _start_summary(self, attrsD):
        context = self._getContext()
        if 'summary' in context:
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self._summaryKey = 'summary'
            self.pushContent(self._summaryKey, attrsD, u'text/plain', 1)
    _start_itunes_summary = _start_summary

    def _end_summary(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            self.popContent(self._summaryKey or 'summary')
        self._summaryKey = None
    _end_itunes_summary = _end_summary

    def _start_enclosure(self, attrsD):
        attrsD = self._itsAnHrefDamnIt(attrsD)
        context = self._getContext()
        attrsD['rel'] = u'enclosure'
        context.setdefault('links', []).append(FeedParserDict(attrsD))

    def _start_source(self, attrsD):
        if 'url' in attrsD:
            # This means that we're processing a source element from an RSS 2.0 feed
            self.sourcedata['href'] = attrsD[u'url']
        self.push('source', 1)
        self.insource = 1
        self.title_depth = -1

    def _end_source(self):
        self.insource = 0
        value = self.pop('source')
        if value:
            self.sourcedata['title'] = value
        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
        self.sourcedata.clear()

    def _start_content(self, attrsD):
        self.pushContent('content', attrsD, u'text/plain', 1)
        src = attrsD.get('src')
        if src:
            self.contentparams['src'] = src
        self.push('content', 1)

    def _start_body(self, attrsD):
        self.pushContent('content', attrsD, u'application/xhtml+xml', 1)
    _start_xhtml_body = _start_body

    def _start_content_encoded(self, attrsD):
        self.pushContent('content', attrsD, u'text/html', 1)
    _start_fullitem = _start_content_encoded

    def _end_content(self):
        copyToSummary = self.mapContentType(self.contentparams.get('type')) in ([u'text/plain'] + self.html_types)
        value = self.popContent('content')
        if copyToSummary:
            self._save('summary', value)

    _end_body = _end_content
    _end_xhtml_body = _end_content
    _end_content_encoded = _end_content
    _end_fullitem = _end_content

    def _start_itunes_image(self, attrsD):
        self.push('itunes_image', 0)
        if attrsD.get('href'):
            self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
        elif attrsD.get('url'):
            self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')})
    _start_itunes_link = _start_itunes_image

    def _end_itunes_block(self):
        value = self.pop('itunes_block', 0)
        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0

    def _end_itunes_explicit(self):
        value = self.pop('itunes_explicit', 0)
        # Convert 'yes' -> True, 'clean' -> False, and any other value to None.
        # False and None both evaluate as False, so the difference can be ignored
        # by applications that only need to know if the content is explicit.
        self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
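
    # The tuple indexing above works out as follows: 'yes' makes the index
    # expression evaluate to 2 (True); 'clean' makes it evaluate to the boolean
    # True, which indexes as 1 (False); any other value falls through to 0 (None).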

    def _start_media_group(self, attrsD):
        # don't do anything, but don't break the enclosed tags either
        pass

    def _start_media_rating(self, attrsD):
        context = self._getContext()
        context.setdefault('media_rating', attrsD)
        self.push('rating', 1)

    def _end_media_rating(self):
        rating = self.pop('rating')
        if rating is not None and rating.strip():
            context = self._getContext()
            context['media_rating']['content'] = rating

    def _start_media_credit(self, attrsD):
        context = self._getContext()
        context.setdefault('media_credit', [])
        context['media_credit'].append(attrsD)
        self.push('credit', 1)

    def _end_media_credit(self):
        credit = self.pop('credit')
        if credit is not None and credit.strip():
            context = self._getContext()
            context['media_credit'][-1]['content'] = credit

    def _start_media_restriction(self, attrsD):
        context = self._getContext()
        context.setdefault('media_restriction', attrsD)
        self.push('restriction', 1)

    def _end_media_restriction(self):
        restriction = self.pop('restriction')
        if restriction is not None and restriction.strip():
            context = self._getContext()
            context['media_restriction']['content'] = [cc.strip().lower() for cc in restriction.split(' ')]

    def _start_media_license(self, attrsD):
        context = self._getContext()
        context.setdefault('media_license', attrsD)
        self.push('license', 1)

    def _end_media_license(self):
        license = self.pop('license')
        if license is not None and license.strip():
            context = self._getContext()
            context['media_license']['content'] = license

    def _start_media_content(self, attrsD):
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)

    def _start_media_thumbnail(self, attrsD):
        context = self._getContext()
        context.setdefault('media_thumbnail', [])
        self.push('url', 1) # new
        context['media_thumbnail'].append(attrsD)

    def _end_media_thumbnail(self):
        url = self.pop('url')
        context = self._getContext()
        if url is not None and url.strip():
            if 'url' not in context['media_thumbnail'][-1]:
                context['media_thumbnail'][-1]['url'] = url

    def _start_media_player(self, attrsD):
        self.push('media_player', 0)
        self._getContext()['media_player'] = FeedParserDict(attrsD)

    def _end_media_player(self):
        value = self.pop('media_player')
        context = self._getContext()
        context['media_player']['content'] = value

    def _start_newlocation(self, attrsD):
        self.push('newlocation', 1)

    def _end_newlocation(self):
        url = self.pop('newlocation')
        context = self._getContext()
        # don't set newlocation if the context isn't right
        if context is not self.feeddata:
            return
        context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())

    def _start_psc_chapters(self, attrsD):
        if self.psc_chapters_flag is None:
            # Transition from None -> True
            self.psc_chapters_flag = True
            attrsD['chapters'] = []
            self._getContext()['psc_chapters'] = FeedParserDict(attrsD)

    def _end_psc_chapters(self):
        # Transition from True -> False
        self.psc_chapters_flag = False

    def _start_psc_chapter(self, attrsD):
        if self.psc_chapters_flag:
            start = self._getAttribute(attrsD, 'start')
            attrsD['start_parsed'] = _parse_psc_chapter_start(start)

            context = self._getContext()['psc_chapters']
            context['chapters'].append(FeedParserDict(attrsD))


if _XML_AVAILABLE:
    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
        def __init__(self, baseuri, baselang, encoding):
            xml.sax.handler.ContentHandler.__init__(self)
            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
            self.bozo = 0
            self.exc = None
            self.decls = {}

        def startPrefixMapping(self, prefix, uri):
            if not uri:
                return
            # Jython uses '' instead of None; standardize on None
            prefix = prefix or None
            self.trackNamespace(prefix, uri)
            if prefix and uri == 'http://www.w3.org/1999/xlink':
                self.decls['xmlns:' + prefix] = uri

        def startElementNS(self, name, qname, attrs):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if lowernamespace.find(u'backend.userland.com/rss') != -1:
                # match any backend.userland.com namespace
                namespace = u'http://backend.userland.com/rss'
                lowernamespace = namespace
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = None
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespacesInUse:
                raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
            localname = str(localname).lower()

            # qname implementation is horribly broken in Python 2.1 (it
            # doesn't report any), and slightly broken in Python 2.2 (it
            # doesn't report the xml: namespace). So we match up namespaces
            # with a known list first, and then possibly override them with
            # the qnames the SAX parser gives us (if indeed it gives us any
            # at all). Thanks to MatejC for helping me test this and
            # tirelessly telling me that it didn't work yet.
            attrsD, self.decls = self.decls, {}
            if localname == 'math' and namespace == 'http://www.w3.org/1998/Math/MathML':
                attrsD['xmlns'] = namespace
            if localname == 'svg' and namespace == 'http://www.w3.org/2000/svg':
                attrsD['xmlns'] = namespace

            if prefix:
                localname = prefix.lower() + ':' + localname
            elif namespace and not qname: # Expat
                for name, value in self.namespacesInUse.items():
                    if name and value == namespace:
                        localname = name + ':' + localname
                        break

            for (namespace, attrlocalname), attrvalue in attrs.items():
                lowernamespace = (namespace or '').lower()
                prefix = self._matchnamespaces.get(lowernamespace, '')
                if prefix:
                    attrlocalname = prefix + ':' + attrlocalname
                attrsD[str(attrlocalname).lower()] = attrvalue
            for qname in attrs.getQNames():
                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
            localname = str(localname).lower()
            self.unknown_starttag(localname, attrsD.items())

        def characters(self, text):
            self.handle_data(text)

        def endElementNS(self, name, qname):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = ''
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if prefix:
                localname = prefix + ':' + localname
            elif namespace and not qname: # Expat
                for name, value in self.namespacesInUse.items():
                    if name and value == namespace:
                        localname = name + ':' + localname
                        break
            localname = str(localname).lower()
            self.unknown_endtag(localname)

        def error(self, exc):
            self.bozo = 1
            self.exc = exc

        # drv_libxml2 calls warning() in some cases
        warning = error

        def fatalError(self, exc):
            self.error(exc)
            raise exc

class _BaseHTMLProcessor(sgmllib.SGMLParser):
    special = re.compile('''[<>'"]''')
    bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
    elements_no_end_tag = set([
        'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
        'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
        'source', 'track', 'wbr'
    ])

    def __init__(self, encoding, _type):
        self.encoding = encoding
        self._type = _type
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'
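
    # For example, the r'<([^<>\s]+?)\s*/>' substitution in feed() below turns
    # '<br/>' into '<br />' but '<span/>' into '<span></span>', since only the
    # elements in elements_no_end_tag may legally self-close in HTML.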

    # By declaring these methods and overriding their compiled code
    # with the code from sgmllib, the original code will execute in
    # feedparser's scope instead of sgmllib's. This means that the
    # `tagfind` and `charref` regular expressions will be found as
    # they're declared above, not as they're declared in sgmllib.
    def goahead(self, i):
        pass
    goahead.func_code = sgmllib.SGMLParser.goahead.func_code

    def __parse_starttag(self, i):
        pass
    __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code

    def parse_starttag(self, i):
        j = self.__parse_starttag(i)
        if self._type == 'application/xhtml+xml':
            if j > 2 and self.rawdata[j-2:j] == '/>':
                self.unknown_endtag(self.lasttag)
        return j

    def feed(self, data):
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        try:
            bytes
            if bytes is str:
                raise NameError
            self.encoding = self.encoding + u'_INVALID_PYTHON_3'
        except NameError:
            if self.encoding and isinstance(data, unicode):
                data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)
        sgmllib.SGMLParser.close(self)

    def normalize_attrs(self, attrs):
        if not attrs:
            return attrs
        # utility method to be called by descendants
        attrs = dict([(k.lower(), v) for k, v in attrs]).items()
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        attrs.sort()
        return attrs
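
    # For example, normalize_attrs([('HREF', 'x'), ('Rel', 'SELF')]) returns
    # [('href', 'x'), ('rel', 'self')]: keys are lowercased and deduplicated,
    # 'rel'/'type' values are lowercased, and the result is sorted.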

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        uattrs = []
        strattrs = ''
        if attrs:
            for key, value in attrs:
                value = value.replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
                value = self.bare_ampersand.sub("&amp;", value)
                # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
                if not isinstance(value, unicode):
                    value = value.decode(self.encoding, 'ignore')
                try:
                    # Currently, in Python 3 the key is already a str, and cannot be decoded again
                    uattrs.append((unicode(key, self.encoding), value))
                except TypeError:
                    uattrs.append((key, value))
            strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
            if self.encoding:
                try:
                    strattrs = strattrs.encode(self.encoding)
                except (UnicodeEncodeError, LookupError):
                    pass
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%s%s />' % (tag, strattrs))
        else:
            self.pieces.append('<%s%s>' % (tag, strattrs))

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%s>" % tag)

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        ref = ref.lower()
        if ref.startswith('x'):
            value = int(ref[1:], 16)
        else:
            value = int(ref)

        if value in _cp1252:
            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
        else:
            self.pieces.append('&#%s;' % ref)
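
    # For example, '&#150;' (an en dash smuggled in via the Windows-1252
    # range) is mapped through _cp1252 and re-emitted as '&#x2013;', while an
    # ordinary reference such as '&#160;' is reproduced unchanged.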

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        if ref in name2codepoint or ref == 'apos':
            self.pieces.append('&%s;' % ref)
        else:
            self.pieces.append('&amp;%s' % ref)

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%s-->' % text)

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%s>' % text)

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%s>' % text)

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
            # self.updatepos(declstartpos, i)
            return None, -1

    def convert_charref(self, name):
        return '&#%s;' % name

    def convert_entityref(self, name):
        return '&%s;' % name

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])

    def parse_declaration(self, i):
        try:
            return sgmllib.SGMLParser.parse_declaration(self, i)
        except sgmllib.SGMLParseError:
            # escape the doctype declaration and continue parsing
            self.handle_data('&lt;')
            return i+1

class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    def __init__(self, baseuri, baselang, encoding, entities):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
        self.entities = entities

    def decodeEntities(self, element, data):
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#x3C;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#x3E;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if not self.contentparams.get('type', u'xml').endswith(u'xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
            data = data.replace('&#x2f;', '/')
            data = data.replace('&#x2F;', '/')
        return data
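
    # The first block above re-encodes numeric references for markup-significant
    # characters as named entities so they survive later processing; e.g. with
    # an XML content type, '&#60;script&#62;' comes back as '&lt;script&gt;'
    # rather than a live '<script>' tag. For non-XML content types the named
    # entities are then decoded to literal characters.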

    def strattrs(self, attrs):
        return ''.join([' %s="%s"' % (n, v.replace('"', '&quot;')) for n, v in attrs])

class _RelativeURIResolver(_BaseHTMLProcessor):
    relative_uris = set([('a', 'href'),
                         ('applet', 'codebase'),
                         ('area', 'href'),
                         ('audio', 'src'),
                         ('blockquote', 'cite'),
                         ('body', 'background'),
                         ('del', 'cite'),
                         ('form', 'action'),
                         ('frame', 'longdesc'),
                         ('frame', 'src'),
                         ('iframe', 'longdesc'),
                         ('iframe', 'src'),
                         ('head', 'profile'),
                         ('img', 'longdesc'),
                         ('img', 'src'),
                         ('img', 'usemap'),
                         ('input', 'src'),
                         ('input', 'usemap'),
                         ('ins', 'cite'),
                         ('link', 'href'),
                         ('object', 'classid'),
                         ('object', 'codebase'),
                         ('object', 'data'),
                         ('object', 'usemap'),
                         ('q', 'cite'),
                         ('script', 'src'),
                         ('source', 'src'),
                         ('video', 'poster'),
                         ('video', 'src')])

    def __init__(self, baseuri, encoding, _type):
        _BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return _makeSafeAbsoluteURI(self.baseuri, uri.strip())

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
    if not _SGML_AVAILABLE:
        return htmlSource

    p = _RelativeURIResolver(baseURI, encoding, _type)
    p.feed(htmlSource)
    return p.output()

def _makeSafeAbsoluteURI(base, rel=None):
    # bail if ACCEPTABLE_URI_SCHEMES is empty
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or u'')
    if not base:
        return rel or u''
    if not rel:
        try:
            scheme = urlparse.urlparse(base)[0]
        except ValueError:
            return u''
        if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
            return base
        return u''
    uri = _urljoin(base, rel)
    if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
        return u''
    return uri
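
# A couple of illustrative calls, assuming the default ACCEPTABLE_URI_SCHEMES
# (which does not include 'javascript'):
#
#     _makeSafeAbsoluteURI(u'http://a.example/', u'feed.xml')
#     # -> u'http://a.example/feed.xml'
#     _makeSafeAbsoluteURI(u'http://a.example/', u'javascript:alert(1)')
#     # -> u''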

class _HTMLSanitizer(_BaseHTMLProcessor):
    acceptable_elements = set(['a', 'abbr', 'acronym', 'address', 'area',
        'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
        'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
        'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
        'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
        'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
        'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
        'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
        'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
        'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
        'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
        'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
        'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'])

    acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
        'background', 'balance', 'bgcolor', 'bgproperties', 'border',
        'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
        'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
        'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
        'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
        'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
        'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
        'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
        'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
        'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
        'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
        'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
        'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
        'poster', 'pqg', 'preload', 'prompt', 'radiogroup', 'readonly', 'rel',
        'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span',
        'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target',
        'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
        'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
        'width', 'wrap', 'xml:lang'])

    unacceptable_elements_with_end_tag = set(['script', 'applet', 'style'])

    acceptable_css_properties = set(['azimuth', 'background-color',
        'border-bottom-color', 'border-collapse', 'border-color',
        'border-left-color', 'border-right-color', 'border-top-color', 'clear',
        'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
        'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
        'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
        'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
        'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
        'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
        'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
        'white-space', 'width'])

    # survey of common keywords found in feeds
    acceptable_css_keywords = set(['auto', 'aqua', 'black', 'block', 'blue',
        'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
        'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
        'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
        'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
        'transparent', 'underline', 'white', 'yellow'])

    valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
        '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')

    mathml_elements = set([
        'annotation',
        'annotation-xml',
        'maction',
        'maligngroup',
        'malignmark',
        'math',
        'menclose',
        'merror',
        'mfenced',
        'mfrac',
        'mglyph',
        'mi',
        'mlabeledtr',
        'mlongdiv',
        'mmultiscripts',
        'mn',
        'mo',
        'mover',
        'mpadded',
        'mphantom',
        'mprescripts',
        'mroot',
        'mrow',
        'ms',
        'mscarries',
        'mscarry',
        'msgroup',
        'msline',
        'mspace',
        'msqrt',
        'msrow',
        'mstack',
        'mstyle',
        'msub',
        'msubsup',
        'msup',
        'mtable',
        'mtd',
        'mtext',
        'mtr',
        'munder',
        'munderover',
        'none',
        'semantics',
    ])

    mathml_attributes = set([
        'accent',
        'accentunder',
        'actiontype',
        'align',
        'alignmentscope',
        'altimg',
        'altimg-height',
        'altimg-valign',
        'altimg-width',
        'alttext',
        'bevelled',
        'charalign',
        'close',
        'columnalign',
        'columnlines',
        'columnspacing',
        'columnspan',
        'columnwidth',
        'crossout',
        'decimalpoint',
        'denomalign',
        'depth',
        'dir',
        'display',
        'displaystyle',
        'edge',
        'encoding',
        'equalcolumns',
        'equalrows',
        'fence',
        'fontstyle',
        'fontweight',
        'form',
        'frame',
        'framespacing',
        'groupalign',
        'height',
        'href',
        'id',
        'indentalign',
        'indentalignfirst',
        'indentalignlast',
        'indentshift',
        'indentshiftfirst',
        'indentshiftlast',
        'indenttarget',
        'infixlinebreakstyle',
        'largeop',
        'length',
        'linebreak',
        'linebreakmultchar',
        'linebreakstyle',
        'lineleading',
        'linethickness',
        'location',
        'longdivstyle',
        'lquote',
        'lspace',
        'mathbackground',
        'mathcolor',
        'mathsize',
        'mathvariant',
        'maxsize',
        'minlabelspacing',
        'minsize',
        'movablelimits',
        'notation',
        'numalign',
        'open',
        'other',
        'overflow',
        'position',
        'rowalign',
        'rowlines',
        'rowspacing',
        'rowspan',
        'rquote',
        'rspace',
        'scriptlevel',
        'scriptminsize',
        'scriptsizemultiplier',
        'selection',
        'separator',
        'separators',
        'shift',
        'side',
        'src',
        'stackalign',
        'stretchy',
        'subscriptshift',
        'superscriptshift',
        'symmetric',
        'voffset',
        'width',
        'xlink:href',
        'xlink:show',
        'xlink:type',
        'xmlns',
        'xmlns:xlink',
    ])

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = set(['a', 'animate', 'animateColor', 'animateMotion',
        'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
        'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
        'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
        'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
        'svg', 'switch', 'text', 'title', 'tspan', 'use'])

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = set(['accent-height', 'accumulate', 'additive', 'alphabetic',
        'arabic-form', 'ascent', 'attributeName', 'attributeType',
        'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
        'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
        'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
        'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
        'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
        'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
        'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
        'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
        'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
        'min', 'name', 'offset', 'opacity', 'orient', 'origin',
        'overline-position', 'overline-thickness', 'panose-1', 'path',
        'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
        'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
        'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
        'stop-color', 'stop-opacity', 'strikethrough-position',
        'strikethrough-thickness', 'stroke', 'stroke-dasharray',
        'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
        'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
        'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
        'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
        'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
        'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
        'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
        'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
        'y2', 'zoomAndPan'])

    svg_attr_map = None
    svg_elem_map = None

    acceptable_svg_properties = set(['fill', 'fill-opacity', 'fill-rule',
        'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
        'stroke-opacity'])
---|
    def reset(self):
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0

    def unknown_starttag(self, tag, attrs):
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if not tag in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # add implicit namespaces to html5 inline svg/mathml
            if self._type.endswith('html'):
                if not dict(attrs).get('xmlns'):
                    if tag == 'svg':
                        attrs.append(('xmlns', 'http://www.w3.org/2000/svg'))
                    if tag == 'math':
                        attrs.append(('xmlns', 'http://www.w3.org/1998/Math/MathML'))

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if tag == 'math' and ('xmlns', 'http://www.w3.org/1998/Math/MathML') in attrs:
                self.mathmlOK += 1
            if tag == 'svg' and ('xmlns', 'http://www.w3.org/2000/svg') in attrs:
                self.svgOK += 1

            # choose acceptable attributes based on tag class, else bail
            if self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # for most vocabularies, lowercasing is a good idea. Many
                # svg elements, however, are camel case
                if not self.svg_attr_map:
                    lower = [attr.lower() for attr in self.svg_attributes]
                    mix = [a for a in self.svg_attributes if a not in lower]
                    self.svg_attributes = lower
                    self.svg_attr_map = dict([(a.lower(), a) for a in mix])

                    lower = [attr.lower() for attr in self.svg_elements]
                    mix = [a for a in self.svg_elements if a not in lower]
                    self.svg_elements = lower
                    self.svg_elem_map = dict([(a.lower(), a) for a in mix])
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag, tag)
                keymap = self.svg_attr_map
            elif not tag in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        if self.mathmlOK or self.svgOK:
            if filter(lambda (n, v): n.startswith('xlink:'), attrs):
                if not ('xmlns:xlink', 'http://www.w3.org/1999/xlink') in attrs:
                    attrs.append(('xmlns:xlink', 'http://www.w3.org/1999/xlink'))

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key in acceptable_attributes:
                key = keymap.get(key, key)
                # make sure the uri uses an acceptable uri scheme
                if key == u'href':
                    value = _makeSafeAbsoluteURI(value)
                clean_attrs.append((key, value))
            elif key == 'style':
                clean_value = self.sanitize_style(value)
                if clean_value:
                    clean_attrs.append((key, clean_value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)

    def unknown_endtag(self, tag):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == 'math' and self.mathmlOK:
                    self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag, tag)
                if tag == 'svg' and self.svgOK:
                    self.svgOK -= 1
            else:
                return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        pass

    def handle_decl(self, text):
        pass

    def handle_data(self, text):
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)

    def sanitize_style(self, style):
        # disallow urls
        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        # This replaced a regexp that used re.match and was prone to pathological back-tracking.
        if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
            return ''

        clean = []
        for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.acceptable_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin', 'padding']:
                for keyword in value.split():
                    if not keyword in self.acceptable_css_keywords and \
                            not self.valid_css_values.match(keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)

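    # Illustrative sketch of the rules above (the input string is made up for
    # this example): a whitelisted property survives, an unlisted one is
    # dropped, and the surviving declarations are re-serialized:
    #
    #     sanitize_style('color: red; position: absolute')  ->  'color: red;'
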
    def parse_comment(self, i, report=1):
        ret = _BaseHTMLProcessor.parse_comment(self, i, report)
        if ret >= 0:
            return ret
        # if ret == -1, this may be a malicious attempt to circumvent
        # sanitization, or a page-destroying unclosed comment
        match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
        if match:
            return match.end()
        # unclosed comment; deliberately fail to handle_data()
        return len(self.rawdata)


def _sanitizeHTML(htmlSource, encoding, _type):
    if not _SGML_AVAILABLE:
        return htmlSource
    p = _HTMLSanitizer(encoding, _type)
    htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
    p.feed(htmlSource)
    data = p.output()
    data = data.strip().replace('\r\n', '\n')
    return data
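
# Illustrative sketch: unacceptable elements and the text inside them are
# removed, while acceptable markup passes through untouched:
#
#     _sanitizeHTML('<p>ok</p><script>alert(1)</script>', 'utf-8', 'text/html')
#     ->  '<p>ok</p>'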

class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, headers):
        # The default implementation just raises HTTPError.
        # Forget that.
        fp.status = code
        return fp

    def http_error_301(self, req, fp, code, msg, hdrs):
        result = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp,
                                                            code, msg, hdrs)
        result.status = code
        result.newurl = result.geturl()
        return result
    # The default implementations in urllib2.HTTPRedirectHandler
    # are identical, so hardcoding a http_error_301 call above
    # won't affect anything
    http_error_300 = http_error_301
    http_error_302 = http_error_301
    http_error_303 = http_error_301
    http_error_307 = http_error_301

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - the server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth.
        # If both conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        if base64 is None or 'Authorization' not in req.headers \
                or 'WWW-Authenticate' not in headers:
            return self.http_error_default(req, fp, code, msg, headers)
        auth = _base64decode(req.headers['Authorization'].split(' ')[1])
        user, passw = auth.split(':')
        realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
        self.add_password(realm, host, user, passw)
        retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
        self.reset_retry_count()
        return retry

def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner. Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it can be a tuple of 9 integers
    (as returned by gmtime() in the standard Python time module) or a date
    string in any format supported by feedparser. Regardless, it MUST
    be in GMT (Greenwich Mean Time). It will be reformatted into an
    RFC 1123-compliant date and used as the value of an If-Modified-Since
    request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.

    If request_headers is supplied, it is a dictionary of HTTP request headers
    that will override the values generated by FeedParser.

    :return: A :class:`StringIO.StringIO` or :class:`io.BytesIO`.
    """

    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if isinstance(url_file_stream_or_string, basestring) \
            and urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
        # Deal with the feed URI scheme
        if url_file_stream_or_string.startswith('feed:http'):
            url_file_stream_or_string = url_file_stream_or_string[5:]
        elif url_file_stream_or_string.startswith('feed:'):
            url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
        if not agent:
            agent = USER_AGENT
        # Test for inline user:password credentials for HTTP basic auth
        auth = None
        if base64 and not url_file_stream_or_string.startswith('ftp:'):
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
                    auth = base64.standard_b64encode(user_passwd).strip()

        # iri support
        if isinstance(url_file_stream_or_string, unicode):
            url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)

        # try to open with urllib2 (to use optional headers)
        request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
        opener = urllib2.build_opener(*tuple(handlers + [_FeedURLHandler()]))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string, 'rb')
    except (IOError, UnicodeEncodeError, TypeError):
        # if url_file_stream_or_string is a unicode object that
        # cannot be converted to the encoding returned by
        # sys.getfilesystemencoding(), a UnicodeEncodeError
        # will be thrown
        # If url_file_stream_or_string is a string that contains NULL
        # (such as an XML document encoded in UTF-32), TypeError will
        # be thrown.
        pass

    # treat url_file_stream_or_string as string
    if isinstance(url_file_stream_or_string, unicode):
        return _StringIO(url_file_stream_or_string.encode('utf-8'))
    return _StringIO(url_file_stream_or_string)

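# Illustrative sketch of a conditional GET (the URL, ETag, and date are made
# up for this example); the saved validators are turned into If-None-Match
# and If-Modified-Since headers:
#
#     stream = _open_resource(u'http://example.org/feed.xml', '"abc123"',
#         'Sat, 12 Sep 2015 08:00:00 GMT', USER_AGENT, None, [], {})
#     data = stream.read()
#     stream.close()
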
def _convert_to_idn(url):
    """Convert a URL to IDN notation"""
    # this function should only be called with a unicode string
    # strategy: if the host cannot be encoded in ascii, then
    # it'll be necessary to encode it in idn form
    parts = list(urlparse.urlsplit(url))
    try:
        parts[1].encode('ascii')
    except UnicodeEncodeError:
        # the url needs to be converted to idn notation
        host = parts[1].rsplit(':', 1)
        newhost = []
        port = u''
        if len(host) == 2:
            port = host.pop()
        for h in host[0].split('.'):
            newhost.append(h.encode('idna').decode('utf-8'))
        parts[1] = '.'.join(newhost)
        if port:
            parts[1] += ':' + port
        return urlparse.urlunsplit(parts)
    else:
        return url

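# Illustrative sketch (the hostname is hypothetical): only the host is
# punycode-encoded, label by label; the path is left untouched:
#
#     _convert_to_idn(u'http://b\u00fccher.example/feed')
#     ->  u'http://xn--bcher-kva.example/feed'
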
def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)
    if etag:
        request.add_header('If-None-Match', etag)
    if isinstance(modified, basestring):
        modified = _parse_date(modified)
    elif isinstance(modified, datetime.datetime):
        modified = modified.utctimetuple()
    if modified:
        # format into an RFC 1123-compliant timestamp. We can't use
        # time.strftime() since the %a and %b directives can be affected
        # by the current locale, but RFC 2616 states that dates must be
        # in English.
        short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
    if referrer:
        request.add_header('Referer', referrer)
    if gzip and zlib:
        request.add_header('Accept-encoding', 'gzip, deflate')
    elif gzip:
        request.add_header('Accept-encoding', 'gzip')
    elif zlib:
        request.add_header('Accept-encoding', 'deflate')
    else:
        request.add_header('Accept-encoding', '')
    if auth:
        request.add_header('Authorization', 'Basic %s' % auth)
    if ACCEPT_HEADER:
        request.add_header('Accept', ACCEPT_HEADER)
    # use this for whatever -- cookies, special headers, etc
    # [('Cookie','Something'),('x-special-header','Another Value')]
    for header_name, header_value in request_headers.items():
        request.add_header(header_name, header_value)
    request.add_header('A-IM', 'feed') # RFC 3229 support
    return request

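# Illustrative sketch (the URL is hypothetical): a 9-tuple in GMT becomes an
# RFC 1123 date, so (2004, 2, 28, 18, 14, 55, 5, 59, 0) is sent as
# 'If-Modified-Since: Sat, 28 Feb 2004 18:14:55 GMT':
#
#     request = _build_urllib2_request(u'http://example.org/feed.xml',
#         USER_AGENT, None, (2004, 2, 28, 18, 14, 55, 5, 59, 0), None, None, {})
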
def _parse_psc_chapter_start(start):
    FORMAT = r'^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$'

    m = re.compile(FORMAT).match(start)
    if m is None:
        return None

    _, h, m, s, _, ms = m.groups()
    h, m, s, ms = (int(h or 0), int(m), int(s), int(ms or 0))
    return datetime.timedelta(0, h*60*60 + m*60 + s, ms*1000)

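# Illustrative sketch: hours and milliseconds are optional and default to 0:
#
#     _parse_psc_chapter_start('01:02:03.500')  ->  datetime.timedelta(0, 3723, 500000)
#     _parse_psc_chapter_start('02:03')         ->  datetime.timedelta(0, 123)
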
_date_handlers = []
def registerDateHandler(func):
    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
    _date_handlers.insert(0, func)

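# Illustrative sketch: a hypothetical handler for '@<epoch-seconds>' strings
# (both the format and the handler name are made up for this example).
# Because handlers are inserted at the front of the list, a handler
# registered later is tried first:
#
#     def _parse_date_epoch(dateString):
#         if not dateString.startswith('@'):
#             return None
#         return time.gmtime(int(dateString[1:]))
#     registerDateHandler(_parse_date_epoch)
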
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
# 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
                 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                 '-YY-?MM', '-OOO', '-YY',
                 '--MM-?DD', '--MM',
                 '---DD',
                 'CC', '']
_iso8601_re = [
    tmpl.replace(
        'YYYY', r'(?P<year>\d{4})').replace(
        'YY', r'(?P<year>\d\d)').replace(
        'MM', r'(?P<month>[01]\d)').replace(
        'DD', r'(?P<day>[0123]\d)').replace(
        'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
        'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(\.(?P<fracsecond>\d+))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
try:
    del tmpl
except NameError:
    pass
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
try:
    del regex
except NameError:
    pass

def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m:
            break
    if not m:
        return
    if m.span() == (0, 0):
        return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
                params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params:
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(float(params.get('second', 0)))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    daylight_savings_flag = -1
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tuple(tm)))
registerDateHandler(_parse_date_iso8601)
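
# Illustrative sketch: both the extended and the basic (dash-free) forms are
# accepted, and a date-only input should yield roughly:
#
#     _parse_date_iso8601('2004-01-05')[:3]  ->  (2004, 1, 5)
#     _parse_date_iso8601('20040105')[:3]    ->  (2004, 1, 5)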

# 8-bit date handling routines written by ytrewq1.
_korean_year  = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day   = u'\uc77c' # c0cf in euc-kr
_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    m = _korean_onblog_date_re.match(dateString)
    if not m:
        return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),
                 'zonediff': '+09:00'}
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)

def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    m = _korean_nate_date_re.match(dateString)
    if not m:
        return
    hour = int(m.group(5))
    ampm = m.group(4)
    if ampm == _korean_pm:
        hour += 12
    hour = str(hour)
    if len(hour) == 1:
        hour = '0' + hour
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),
                 'hour': hour, 'minute': m.group(6), 'second': m.group(7),
                 'zonediff': '+09:00'}
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)

# Unicode strings for Greek date strings
_greek_months = {
    u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
    u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
    u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
    u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
    u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
    u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
    u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
    u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
    u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
    u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
    u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
    u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb in iso-8859-7
    u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
    u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
    u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
    u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
    u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
    u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
    u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
}

_greek_wdays = {
    u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
    u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
    u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
    u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
    u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
    u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
    u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
}

_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')

def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    m = _greek_date_format_re.match(dateString)
    if not m:
        return
    wday = _greek_wdays[m.group(1)]
    month = _greek_months[m.group(3)]
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),
                  'zonediff': m.group(8)}
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)

# Unicode strings for Hungarian date strings
_hungarian_months = {
    u'janu\u00e1r':   u'01', # e1 in iso-8859-2
    u'febru\u00e1ri': u'02', # e1 in iso-8859-2
    u'm\u00e1rcius':  u'03', # e1 in iso-8859-2
    u'\u00e1prilis':  u'04', # e1 in iso-8859-2
    u'm\u00e1jus':    u'05', # e1 in iso-8859-2
    u'j\u00fanius':   u'06', # fa in iso-8859-2
    u'j\u00falius':   u'07', # fa in iso-8859-2
    u'augusztus':     u'08',
    u'szeptember':    u'09',
    u'okt\u00f3ber':  u'10', # f3 in iso-8859-2
    u'november':      u'11',
    u'december':      u'12',
}

_hungarian_date_format_re = \
    re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')

def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    m = _hungarian_date_format_re.match(dateString)
    if not m or m.group(2) not in _hungarian_months:
        return None
    month = _hungarian_months[m.group(2)]
    day = m.group(3)
    if len(day) == 1:
        day = '0' + day
    hour = m.group(4)
    if len(hour) == 1:
        hour = '0' + hour
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                {'year': m.group(1), 'month': month, 'day': day,
                 'hour': hour, 'minute': m.group(5),
                 'zonediff': m.group(6)}
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)

timezonenames = {
    'ut': 0, 'gmt': 0, 'z': 0,
    'adt': -3, 'ast': -4, 'at': -4,
    'edt': -4, 'est': -5, 'et': -5,
    'cdt': -5, 'cst': -6, 'ct': -6,
    'mdt': -6, 'mst': -7, 'mt': -7,
    'pdt': -7, 'pst': -8, 'pt': -8,
    'a': -1, 'n': 1,
    'm': -12, 'y': 12,
}
# W3 date and time format parser
# http://www.w3.org/TR/NOTE-datetime
# Also supports MSSQL-style datetimes as defined at:
# http://msdn.microsoft.com/en-us/library/ms186724.aspx
# (basically, allow a space as a date/time/timezone separator)
def _parse_date_w3dtf(datestr):
    if not datestr.strip():
        return None
    parts = datestr.lower().split('t')
    if len(parts) == 1:
        # This may be a date only, or may be an MSSQL-style date
        parts = parts[0].split()
        if len(parts) == 1:
            # Treat this as a date only
            parts.append('00:00:00z')
    elif len(parts) > 2:
        return None
    date = parts[0].split('-', 2)
    if not date or len(date[0]) != 4:
        return None
    # Ensure that `date` has 3 elements. Using '1' sets the default
    # month to January and the default day to the 1st of the month.
    date.extend(['1'] * (3 - len(date)))
    try:
        year, month, day = [int(i) for i in date]
    except ValueError:
        # `date` may have more than 3 elements or may contain
        # non-integer strings.
        return None
    if parts[1].endswith('z'):
        parts[1] = parts[1][:-1]
        parts.append('z')
    # Append the numeric timezone offset, if any, to parts.
    # If this is an MSSQL-style date then parts[2] already contains
    # the timezone information, so `append()` will not affect it.
    # Add 1 to each value so that if `find()` returns -1 it will be
    # treated as False.
    loc = parts[1].find('-') + 1 or parts[1].find('+') + 1 or len(parts[1]) + 1
    loc = loc - 1
    parts.append(parts[1][loc:])
    parts[1] = parts[1][:loc]
    time = parts[1].split(':', 2)
    # Ensure that time has 3 elements. Using '0' means that the
    # minutes and seconds, if missing, will default to 0.
    time.extend(['0'] * (3 - len(time)))
    tzhour = 0
    tzmin = 0
    if parts[2][:1] in ('-', '+'):
        try:
            tzhour = int(parts[2][1:3])
            tzmin = int(parts[2][4:])
        except ValueError:
            return None
        if parts[2].startswith('-'):
            tzhour = tzhour * -1
            tzmin = tzmin * -1
    else:
        tzhour = timezonenames.get(parts[2], 0)
    try:
        hour, minute, second = [int(float(i)) for i in time]
    except ValueError:
        return None
    # Create the datetime object and timezone delta objects
    try:
        stamp = datetime.datetime(year, month, day, hour, minute, second)
    except ValueError:
        return None
    delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
    # Return the date and timestamp in a UTC 9-tuple
    try:
        return (stamp - delta).utctimetuple()
    except (OverflowError, ValueError):
        # IronPython throws ValueErrors instead of OverflowErrors
        return None

registerDateHandler(_parse_date_w3dtf)
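
# Illustrative sketch: the timezone offset is folded into the returned UTC
# 9-tuple:
#
#     _parse_date_w3dtf(u'2003-12-31T10:14:55-08:00')[:6]
#     ->  (2003, 12, 31, 18, 14, 55)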

def _parse_date_rfc822(date):
    """Parse RFC 822 dates and times
    http://tools.ietf.org/html/rfc822#section-5

    There are some formatting differences that are accounted for:
    1. Years may be two or four digits.
    2. The month and day can be swapped.
    3. Additional timezone names are supported.
    4. A default time and timezone are assumed if only a date is present.
    """
    daynames = set(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'])
    months = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
    }

    parts = date.lower().split()
    if len(parts) < 5:
        # Assume that the time and timezone are missing
        parts.extend(('00:00:00', '0000'))
    # Remove the day name
    if parts[0][:3] in daynames:
        parts = parts[1:]
    if len(parts) < 5:
        # If there are still fewer than five parts, there's not enough
        # information to interpret this
        return None
    try:
        day = int(parts[0])
    except ValueError:
        # Check if the day and month are swapped
        if months.get(parts[0][:3]):
            try:
                day = int(parts[1])
            except ValueError:
                return None
            else:
                parts[1] = parts[0]
        else:
            return None
    month = months.get(parts[1][:3])
    if not month:
        return None
    try:
        year = int(parts[2])
    except ValueError:
        return None
    # Normalize two-digit years:
    # Anything in the 90's is interpreted as 1990 and on
    # Anything 89 or less is interpreted as 2089 or before
    if len(parts[2]) <= 2:
        year += (1900, 2000)[year < 90]
    timeparts = parts[3].split(':')
    timeparts = timeparts + ([0] * (3 - len(timeparts)))
    try:
        (hour, minute, second) = map(int, timeparts)
    except ValueError:
        return None
    tzhour = 0
    tzmin = 0
    # Strip 'Etc/' from the timezone
    if parts[4].startswith('etc/'):
        parts[4] = parts[4][4:]
    # Normalize timezones that start with 'gmt':
    # GMT-05:00 => -0500
    # GMT => GMT
    if parts[4].startswith('gmt'):
        parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt'
    # Handle timezones like '-0500', '+0500', and 'EST'
    if parts[4] and parts[4][0] in ('-', '+'):
        try:
            tzhour = int(parts[4][1:3])
            tzmin = int(parts[4][3:])
        except ValueError:
            return None
        if parts[4].startswith('-'):
            tzhour = tzhour * -1
            tzmin = tzmin * -1
    else:
        tzhour = timezonenames.get(parts[4], 0)
    # Create the datetime object and timezone delta objects
    try:
        stamp = datetime.datetime(year, month, day, hour, minute, second)
    except ValueError:
        return None
    delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
    # Return the date and timestamp in a UTC 9-tuple
    try:
        return (stamp - delta).utctimetuple()
    except (OverflowError, ValueError):
        # IronPython throws ValueErrors instead of OverflowErrors
        return None
registerDateHandler(_parse_date_rfc822)
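
# Illustrative sketch: the day name is optional and the result is a
# normalized UTC tuple:
#
#     _parse_date_rfc822(u'Thu, 01 Jan 2004 19:48:21 GMT')[:6]
#     ->  (2004, 1, 1, 19, 48, 21)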

_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
def _parse_date_asctime(dt):
    """Parse asctime-style dates.

    Converts asctime to RFC822-compatible dates and uses the RFC822 parser
    to do the actual parsing.

    Supported formats (format is standardized to the first one listed):

    * {weekday name} {month name} dd hh:mm:ss {+-tz} yyyy
    * {weekday name} {month name} dd hh:mm:ss yyyy
    """

    parts = dt.split()

    # Insert a GMT timezone, if needed.
    if len(parts) == 5:
        parts.insert(4, '+0000')

    # Exit if there are not six parts.
    if len(parts) != 6:
        return None

    # Reassemble the parts in an RFC822-compatible order and parse them.
    return _parse_date_rfc822(' '.join([
        parts[0], parts[2], parts[1], parts[5], parts[3], parts[4],
    ]))
registerDateHandler(_parse_date_asctime)
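
# Illustrative sketch: asctime input is rearranged into RFC 822 order before
# parsing, so it should yield roughly:
#
#     _parse_date_asctime(u'Sun Jan  4 16:29:06 2004')[:6]
#     ->  (2004, 1, 4, 16, 29, 6)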

def _parse_date_perforce(aDateString):
    """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
    # Fri, 2006/09/15 08:19:53 EDT
    _my_date_pattern = re.compile(
        r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')

    m = _my_date_pattern.search(aDateString)
    if m is None:
        return None
    dow, year, month, day, hour, minute, second, tz = m.groups()
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
registerDateHandler(_parse_date_perforce)

def _parse_date(dateString):
    '''Parses a variety of date formats into a 9-tuple in GMT'''
    if not dateString:
        return None
    for handler in _date_handlers:
        try:
            date9tuple = handler(dateString)
        except (KeyError, OverflowError, ValueError):
            continue
        if not date9tuple:
            continue
        if len(date9tuple) != 9:
            continue
        return date9tuple
    return None

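# Illustrative sketch: the registered handlers are tried in turn until one
# returns a 9-tuple; unparseable input falls through to None:
#
#     _parse_date(u'2004-02-28T18:14:55Z')[:6]  ->  (2004, 2, 28, 18, 14, 55)
#     _parse_date(u'not a date')                ->  None
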
# Each marker represents some of the characters of the opening XML
# processing instruction ('<?xm') in the specified encoding.
EBCDIC_MARKER = _l2bytes([0x4C, 0x6F, 0xA7, 0x94])
UTF16BE_MARKER = _l2bytes([0x00, 0x3C, 0x00, 0x3F])
UTF16LE_MARKER = _l2bytes([0x3C, 0x00, 0x3F, 0x00])
UTF32BE_MARKER = _l2bytes([0x00, 0x00, 0x00, 0x3C])
UTF32LE_MARKER = _l2bytes([0x3C, 0x00, 0x00, 0x00])

ZERO_BYTES = _l2bytes([0x00, 0x00])

# Match the opening XML declaration.
# Example: <?xml version="1.0" encoding="utf-8"?>
RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')

# Capture the value of the XML processing instruction's encoding attribute.
# Example: <?xml version="1.0" encoding="utf-8"?>
RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))

def convert_to_utf8(http_headers, data):
    '''Detect and convert the character encoding to UTF-8.

    http_headers is a dictionary
    data is a raw string (not Unicode)'''

    # This is so much trickier than it sounds, it's not even funny.
    # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    # is application/xml, application/*+xml,
    # application/xml-external-parsed-entity, or application/xml-dtd,
    # the encoding given in the charset parameter of the HTTP Content-Type
    # takes precedence over the encoding given in the XML prefix within the
    # document, and defaults to 'utf-8' if neither are specified. But, if
    # the HTTP Content-Type is text/xml, text/*+xml, or
    # text/xml-external-parsed-entity, the encoding given in the XML prefix
    # within the document is ALWAYS IGNORED and only the encoding given in
    # the charset parameter of the HTTP Content-Type header should be
    # respected, and it defaults to 'us-ascii' if not specified.

    # Furthermore, discussion on the atom-syntax mailing list with the
    # author of RFC 3023 leads me to the conclusion that any document
    # served with a Content-Type of text/* and no charset parameter
    # must be treated as us-ascii. (We now do this.) And also that it
    # must always be flagged as non-well-formed. (We now do this too.)

    # If Content-Type is unspecified (input was local file or non-HTTP source)
    # or unrecognized (server just got it totally wrong), then go by the
    # encoding given in the XML prefix of the document and default to
    # 'iso-8859-1' as per the HTTP specification (RFC 2616).

    # Then, assuming we didn't find a character encoding in the HTTP headers
    # (and the HTTP Content-type allowed us to look in the body), we need
    # to sniff the first few bytes of the XML data and try to determine
    # whether the encoding is ASCII-compatible. Section F of the XML
    # specification shows the way here:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    # If the sniffed encoding is not ASCII-compatible, we need to make it
    # ASCII compatible so that we can sniff further into the XML declaration
    # to find the encoding attribute, which will tell us the true encoding.

    # Of course, none of this guarantees that we will be able to parse the
    # feed in the declared character encoding (assuming it was declared
    # correctly, which many are not). iconv_codec can help a lot;
    # you should definitely install it if you can.
    # http://cjkpython.i18n.org/

    bom_encoding = u''
    xml_encoding = u''
    rfc3023_encoding = u''

    # Look at the first few bytes of the document to guess what
    # its encoding may be. We only need to decode enough of the
    # document that we can use an ASCII-compatible regular
    # expression to search for an XML encoding declaration.
    # The heuristic follows the XML specification, section F:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    # Check for BOMs first.
    if data[:4] == codecs.BOM_UTF32_BE:
        bom_encoding = u'utf-32be'
        data = data[4:]
    elif data[:4] == codecs.BOM_UTF32_LE:
        bom_encoding = u'utf-32le'
        data = data[4:]
    elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
        bom_encoding = u'utf-16be'
        data = data[2:]
    elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
        bom_encoding = u'utf-16le'
        data = data[2:]
    elif data[:3] == codecs.BOM_UTF8:
        bom_encoding = u'utf-8'
        data = data[3:]
    # Check for the characters '<?xm' in several encodings.
    elif data[:4] == EBCDIC_MARKER:
        bom_encoding = u'cp037'
    elif data[:4] == UTF16BE_MARKER:
        bom_encoding = u'utf-16be'
    elif data[:4] == UTF16LE_MARKER:
        bom_encoding = u'utf-16le'
    elif data[:4] == UTF32BE_MARKER:
        bom_encoding = u'utf-32be'
    elif data[:4] == UTF32LE_MARKER:
        bom_encoding = u'utf-32le'

    tempdata = data
    try:
        if bom_encoding:
            tempdata = data.decode(bom_encoding).encode('utf-8')
    except (UnicodeDecodeError, LookupError):
        # feedparser recognizes UTF-32 encodings that aren't
        # available in Python 2.4 and 2.5, so it's possible to
        # encounter a LookupError during decoding.
        xml_encoding_match = None
    else:
        xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)

    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
        # Normalize the xml_encoding if necessary.
        if bom_encoding and (xml_encoding in (
            u'u16', u'utf-16', u'utf16', u'utf_16',
            u'u32', u'utf-32', u'utf32', u'utf_32',
            u'iso-10646-ucs-2', u'iso-10646-ucs-4',
            u'csucs4', u'csunicode', u'ucs-2', u'ucs-4'
        )):
            xml_encoding = bom_encoding

    # Find the HTTP Content-Type and, hopefully, a character
    # encoding provided by the server. The Content-Type is used
    # to choose the "correct" encoding among the BOM encoding,
    # XML declaration encoding, and HTTP encoding, following the
    # heuristic defined in RFC 3023.
    http_content_type = http_headers.get('content-type') or ''
    http_content_type, params = cgi.parse_header(http_content_type)
    http_encoding = params.get('charset', '').replace("'", "")
    if not isinstance(http_encoding, unicode):
        http_encoding = http_encoding.decode('utf-8', 'ignore')

    acceptable_content_type = 0
    application_content_types = (u'application/xml', u'application/xml-dtd',
                                 u'application/xml-external-parsed-entity')
    text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
            (http_content_type.startswith(u'application/') and
             http_content_type.endswith(u'+xml')):
        acceptable_content_type = 1
        rfc3023_encoding = http_encoding or xml_encoding or u'utf-8'
    elif (http_content_type in text_content_types) or \
            (http_content_type.startswith(u'text/') and
             http_content_type.endswith(u'+xml')):
        acceptable_content_type = 1
        rfc3023_encoding = http_encoding or u'us-ascii'
    elif http_content_type.startswith(u'text/'):
        rfc3023_encoding = http_encoding or u'us-ascii'
    elif http_headers and 'content-type' not in http_headers:
        rfc3023_encoding = xml_encoding or u'iso-8859-1'
    else:
        rfc3023_encoding = xml_encoding or u'utf-8'
    # gb18030 is a superset of gb2312, so always replace gb2312
    # with gb18030 for greater compatibility.
    if rfc3023_encoding.lower() == u'gb2312':
        rfc3023_encoding = u'gb18030'
    if xml_encoding.lower() == u'gb2312':
        xml_encoding = u'gb18030'

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    error = None

    if http_headers and (not acceptable_content_type):
        if 'content-type' in http_headers:
            msg = '%s is not an XML media type' % http_headers['content-type']
        else:
            msg = 'no Content-type specified'
        error = NonXMLContentType(msg)

    # determine character encoding
    known_encoding = 0
    lazy_chardet_encoding = None
    tried_encodings = []
    if chardet:
        def lazy_chardet_encoding():
            chardet_encoding = chardet.detect(data)['encoding']
            if not chardet_encoding:
                chardet_encoding = ''
            if not isinstance(chardet_encoding, unicode):
                chardet_encoding = unicode(chardet_encoding, 'ascii', 'ignore')
            return chardet_encoding
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
                              lazy_chardet_encoding, u'utf-8', u'windows-1252', u'iso-8859-2'):
        if callable(proposed_encoding):
            proposed_encoding = proposed_encoding()
        if not proposed_encoding:
            continue
        if proposed_encoding in tried_encodings:
            continue
        tried_encodings.append(proposed_encoding)
        try:
            data = data.decode(proposed_encoding)
        except (UnicodeDecodeError, LookupError):
            pass
        else:
            known_encoding = 1
            # Update the encoding in the opening XML processing instruction.
            new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
            if RE_XML_DECLARATION.search(data):
                data = RE_XML_DECLARATION.sub(new_declaration, data)
            else:
                data = new_declaration + u'\n' + data
            data = data.encode('utf-8')
            break
    # if still no luck, give up
    if not known_encoding:
        error = CharacterEncodingUnknown(
            'document encoding unknown, I tried ' +
            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
            (rfc3023_encoding, xml_encoding))
        rfc3023_encoding = u''
    elif proposed_encoding != rfc3023_encoding:
        error = CharacterEncodingOverride(
            'document declared as %s, but parsed as %s' %
            (rfc3023_encoding, proposed_encoding))
        rfc3023_encoding = proposed_encoding

    return data, rfc3023_encoding, error

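# Illustrative sketch: for an application/xml Content-Type the HTTP charset
# wins over the XML declaration, and the returned document is re-encoded as
# UTF-8 with a rewritten declaration:
#
#     data, encoding, error = convert_to_utf8(
#         {'content-type': 'application/xml; charset=iso-8859-1'},
#         '<?xml version="1.0" encoding="utf-8"?><feed/>')
#     encoding  ->  u'iso-8859-1'
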
# Match XML entity declarations.
# Example: <!ENTITY copyright "(C)">
RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)

# Match XML DOCTYPE declarations.
# Example: <!DOCTYPE feed [ ]>
RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)

# Match safe entity declarations.
# This will allow hexadecimal character references through,
# as well as text, but not arbitrary nested entities.
# Example: cubed "&#179;"
# Example: copyright "(C)"
# Forbidden: explode1 "&explode2;&explode2;"
RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))

def replace_doctype(data):
    '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data,
    safe_entities)

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document with a replaced DOCTYPE
    safe_entities is a dict mapping the names of the entities that were
    judged safe to their replacement text
    '''

    # Divide the document into two groups by finding the location
    # of the first element that doesn't begin with '<?' or '<!'.
    start = re.search(_s2bytes(r'<\w'), data)
    start = start and start.start() or -1
    head, data = data[:start+1], data[start+1:]

    # Save and then remove all of the ENTITY declarations.
    entity_results = RE_ENTITY_PATTERN.findall(head)
    head = RE_ENTITY_PATTERN.sub(_s2bytes(''), head)

    # Find the DOCTYPE declaration and check the feed type.
    doctype_results = RE_DOCTYPE_PATTERN.findall(head)
    doctype = doctype_results and doctype_results[0] or _s2bytes('')
    if _s2bytes('netscape') in doctype.lower():
        version = u'rss091n'
    else:
        version = None

    # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
    replacement = _s2bytes('')
    if len(doctype_results) == 1 and entity_results:
        match_safe_entities = lambda e: RE_SAFE_ENTITY_PATTERN.match(e)
        safe_entities = filter(match_safe_entities, entity_results)
        if safe_entities:
            replacement = _s2bytes('<!DOCTYPE feed [\n<!ENTITY') \
                        + _s2bytes('>\n<!ENTITY ').join(safe_entities) \
                        + _s2bytes('>\n]>')
    data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data

    # Precompute the safe entities for the loose parser.
    safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
                      for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
    return version, data, safe_entities
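
# Example (an illustrative sketch; the entity name is made up):
#
#     version, data, entities = replace_doctype(
#         '<!DOCTYPE feed [\n<!ENTITY copyright "(C)">\n]>\n<feed/>')
#     # version  -> None (only a Netscape DOCTYPE yields u'rss091n')
#     # entities -> {u'copyright': u'(C)'}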


# GeoRSS geometry parsers. Each returns a dict with 'type' and 'coordinates'
# items, or None in the case of a parsing error.

def _parse_poslist(value, geom_type, swap=True, dims=2):
    if geom_type == 'linestring':
        return _parse_georss_line(value, swap, dims)
    elif geom_type == 'polygon':
        ring = _parse_georss_line(value, swap, dims)
        if ring is None:
            # Propagate the parsing error instead of raising a TypeError
            # when subscripting None below.
            return None
        return {'type': u'Polygon', 'coordinates': (ring['coordinates'],)}
    else:
        return None

def _gen_georss_coords(value, swap=True, dims=2):
    # A generator of (lon, lat) pairs from a string of encoded GeoRSS
    # coordinates. Converts to floats and (optionally) swaps the order.
    latlons = itertools.imap(float, value.strip().replace(',', ' ').split())
    nxt = latlons.next
    while True:
        t = [nxt(), nxt()][::swap and -1 or 1]
        if dims == 3:
            t.append(nxt())
        yield tuple(t)

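# Example (illustrative): GeoRSS encodes "lat lon" pairs, so with the
# default swap=True the generator yields (lon, lat) tuples:
#
#     list(_gen_georss_coords(u'45.256 -71.92 46.46 -109.48'))
#     # -> [(-71.92, 45.256), (-109.48, 46.46)]
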
def _parse_georss_point(value, swap=True, dims=2):
    # A point contains a single latitude-longitude pair, separated by
    # whitespace. We'll also handle comma separators.
    try:
        coords = list(_gen_georss_coords(value, swap, dims))
        return {u'type': u'Point', u'coordinates': coords[0]}
    except (IndexError, ValueError):
        return None

def _parse_georss_line(value, swap=True, dims=2):
    # A line contains a space-separated list of latitude-longitude pairs in
    # the WGS84 coordinate reference system, with each pair separated by
    # whitespace. There must be at least two pairs.
    try:
        coords = list(_gen_georss_coords(value, swap, dims))
        return {u'type': u'LineString', u'coordinates': coords}
    except (IndexError, ValueError):
        return None

def _parse_georss_polygon(value, swap=True, dims=2):
    # A polygon contains a space-separated list of latitude-longitude pairs,
    # with each pair separated by whitespace. There must be at least four
    # pairs, with the last being identical to the first (so a polygon has a
    # minimum of three actual points).
    try:
        ring = list(_gen_georss_coords(value, swap, dims))
    except (IndexError, ValueError):
        return None
    if len(ring) < 4:
        return None
    return {u'type': u'Polygon', u'coordinates': (ring,)}

def _parse_georss_box(value, swap=True, dims=2):
    # A bounding box is a rectangular region, often used to define the extents
    # of a map or a rough area of interest. A box contains two space-separated
    # latitude-longitude pairs, with each pair separated by whitespace. The
    # first pair is the lower corner, the second is the upper corner.
    try:
        coords = list(_gen_georss_coords(value, swap, dims))
        return {u'type': u'Box', u'coordinates': tuple(coords)}
    except (IndexError, ValueError):
        return None

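# Example (an illustrative sketch of the parsers above):
#
#     _parse_georss_point(u'45.256 -71.92')
#     # -> {u'type': u'Point', u'coordinates': (-71.92, 45.256)}
#     _parse_georss_box(u'42.943 -71.032 43.039 -69.856')
#     # -> {u'type': u'Box',
#     #     u'coordinates': ((-71.032, 42.943), (-69.856, 43.039))}
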
# end geospatial parsers


def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
    '''Parse a feed from a URL, file, stream, or string.

    request_headers, if given, is a dict from http header name to value to add
    to the request; this overrides internally generated values.

    :return: A :class:`FeedParserDict`.
    '''

    if handlers is None:
        handlers = []
    if request_headers is None:
        request_headers = {}
    if response_headers is None:
        response_headers = {}

    result = FeedParserDict()
    result['feed'] = FeedParserDict()
    result['entries'] = []
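    # 'bozo' is flipped to 1 (and 'bozo_exception' set) whenever the feed
    # cannot be retrieved, decompressed, decoded, or parsed as well-formed
    # XML.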
    result['bozo'] = 0
    if not isinstance(handlers, list):
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
        data = f.read()
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = None
        f = None

    if hasattr(f, 'headers'):
        result['headers'] = dict(f.headers)
    # overwrite existing headers using response_headers
    if 'headers' in result:
        result['headers'].update(response_headers)
    elif response_headers:
        result['headers'] = copy.deepcopy(response_headers)

    # lowercase all of the HTTP headers for comparisons per RFC 2616
    if 'headers' in result:
        http_headers = dict((k.lower(), v) for k, v in result['headers'].items())
    else:
        http_headers = {}

    # if feed is gzip-compressed, decompress it
    if f and data and http_headers:
        if gzip and 'gzip' in http_headers.get('content-encoding', ''):
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except (IOError, struct.error), e:
                # IOError can occur if the gzip header is bad.
                # struct.error can occur if the data is damaged.
                result['bozo'] = 1
                result['bozo_exception'] = e
                if isinstance(e, struct.error):
                    # A gzip header was found but the data is corrupt.
                    # Ideally, we should re-request the feed without the
                    # 'Accept-encoding: gzip' header, but we don't.
                    data = None
        elif zlib and 'deflate' in http_headers.get('content-encoding', ''):
            try:
                data = zlib.decompress(data)
            except zlib.error, e:
                try:
                    # The data may have no headers and no checksum.
                    data = zlib.decompress(data, -15)
                except zlib.error, e:
                    result['bozo'] = 1
                    result['bozo_exception'] = e

    # save HTTP headers
    if http_headers:
        if 'etag' in http_headers:
            etag = http_headers.get('etag', u'')
            if not isinstance(etag, unicode):
                etag = etag.decode('utf-8', 'ignore')
            if etag:
                result['etag'] = etag
        if 'last-modified' in http_headers:
            modified = http_headers.get('last-modified', u'')
            if modified:
                result['modified'] = modified
                result['modified_parsed'] = _parse_date(modified)
    if hasattr(f, 'url'):
        if not isinstance(f.url, unicode):
            result['href'] = f.url.decode('utf-8', 'ignore')
        else:
            result['href'] = f.url
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'close'):
        f.close()

    if data is None:
        return result

    # Stop processing if the server sent HTTP 304 Not Modified.
    if getattr(f, 'code', 0) == 304:
        result['version'] = u''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data. This is a feature, not a bug!'
        return result

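    # Normalize the document to UTF-8 bytes so the parsers below can assume
    # a single known encoding.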
    data, result['encoding'], error = convert_to_utf8(http_headers, data)
    use_strict_parser = result['encoding'] and True or False
    if error is not None:
        result['bozo'] = 1
        result['bozo_exception'] = error

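    # Strip the DOCTYPE, keeping any safe entity definitions for the loose
    # parser, and sniff for the Netscape RSS 0.91 variant.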
    result['version'], data, entities = replace_doctype(data)

    # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
    contentloc = http_headers.get('content-location', u'')
    href = result.get('href', u'')
    baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href

    baselang = http_headers.get('content-language', None)
    if not isinstance(baselang, unicode) and baselang is not None:
        baselang = baselang.decode('utf-8', 'ignore')

    if not _XML_AVAILABLE:
        use_strict_parser = 0
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        try:
            # disable downloading external doctype references, if possible
            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
        except xml.sax.SAXNotSupportedException:
            pass
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        try:
            saxparser.parse(source)
        except xml.sax.SAXException, e:
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0
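    # The strict parser failed or is unavailable; fall back on the loose
    # SGML-based parser, which tolerates ill-formed markup.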
    if not use_strict_parser and _SGML_AVAILABLE:
        feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
        feedparser.feed(data.decode('utf-8', 'replace'))
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespacesInUse
    return result

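# Example (an illustrative sketch; the URL is hypothetical):
#
#     import feedparser
#     d = feedparser.parse('http://example.org/feed.atom')
#     d['bozo']               # 1 if the feed was ill-formed or unreachable
#     d['feed'].get('title')  # feed-level metadata
#     [e.get('title') for e in d['entries']]
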
# EPSG codes for geographic (latitude/longitude) coordinate systems,
# used to support decoding of GeoRSS GML profiles.
_geogCS = [
3819, 3821, 3824, 3889, 3906, 4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008,
4009, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4018, 4019, 4020, 4021, 4022,
4023, 4024, 4025, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036,
4041, 4042, 4043, 4044, 4045, 4046, 4047, 4052, 4053, 4054, 4055, 4075, 4081,
4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132,
4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145,
4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158,
4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171,
4172, 4173, 4174, 4175, 4176, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185,
4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200,
4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213,
4214, 4215, 4216, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227,
4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240,
4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4252, 4253,
4254, 4255, 4256, 4257, 4258, 4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266,
4267, 4268, 4269, 4270, 4271, 4272, 4273, 4274, 4275, 4276, 4277, 4278, 4279,
4280, 4281, 4282, 4283, 4284, 4285, 4286, 4287, 4288, 4289, 4291, 4292, 4293,
4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301, 4302, 4303, 4304, 4306, 4307,
4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4322,
4324, 4326, 4463, 4470, 4475, 4483, 4490, 4555, 4558, 4600, 4601, 4602, 4603,
4604, 4605, 4606, 4607, 4608, 4609, 4610, 4611, 4612, 4613, 4614, 4615, 4616,
4617, 4618, 4619, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629,
4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642,
4643, 4644, 4645, 4646, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665,
4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678,
4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691,
4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704,
4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717,
4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4730,
4731, 4732, 4733, 4734, 4735, 4736, 4737, 4738, 4739, 4740, 4741, 4742, 4743,
4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756,
4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4801, 4802, 4803, 4804,
4805, 4806, 4807, 4808, 4809, 4810, 4811, 4813, 4814, 4815, 4816, 4817, 4818,
4819, 4820, 4821, 4823, 4824, 4901, 4902, 4903, 4904, 4979 ]
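
# For example, EPSG:4326 (WGS 84) appears in the list above. When a GeoRSS
# GML geometry identifies its coordinate reference system as one of these
# codes, its coordinates are latitude-longitude pairs and are swapped to
# (lon, lat) order, as in _gen_georss_coords() above.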