diff options
author | Walter Purcaro <vuolter@gmail.com> | 2014-03-28 22:32:14 +0100 |
---|---|---|
committer | Walter Purcaro <vuolter@gmail.com> | 2014-06-28 02:47:08 +0200 |
commit | b1fffc3a1b2dbbb807213b85f538e59251b9bf35 (patch) | |
tree | c373d3234dcb474bb424371a3d89341bed8a9e07 /module/lib/feedparser.py | |
parent | Plugins licensing doc (diff) | |
download | pyload-b1fffc3a1b2dbbb807213b85f538e59251b9bf35.tar.xz |
Remove bad whitespaces
Merged vuolter/pyload@00288e6
Diffstat (limited to 'module/lib/feedparser.py')
-rw-r--r-- | module/lib/feedparser.py | 192 |
1 files changed, 96 insertions, 96 deletions
diff --git a/module/lib/feedparser.py b/module/lib/feedparser.py index a746ed8f5..32f9d2dd7 100644 --- a/module/lib/feedparser.py +++ b/module/lib/feedparser.py @@ -89,7 +89,7 @@ try: except (NameError, AttributeError): import string _maketrans = string.maketrans - + # base64 support for Atom feeds that contain embedded binary data try: import base64, binascii @@ -334,7 +334,7 @@ class FeedParserDict(UserDict): if not self.has_key(key): self[key] = value return self[key] - + def has_key(self, key): try: return hasattr(self, key) or UserDict.__contains__(self, key) @@ -343,7 +343,7 @@ class FeedParserDict(UserDict): # This alias prevents the 2to3 tool from changing the semantics of the # __contains__ function below and exhausting the maximum recursion depth __has_key = has_key - + def __getattr__(self, key): try: return self.__dict__[key] @@ -451,7 +451,7 @@ class _FeedParserMixin: 'http://purl.org/atom/ns#': '', 'http://www.w3.org/2005/Atom': '', 'http://purl.org/rss/1.0/modules/rss091#': '', - + 'http://webns.net/mvcb/': 'admin', 'http://purl.org/rss/1.0/modules/aggregation/': 'ag', 'http://purl.org/rss/1.0/modules/annotate/': 'annotate', @@ -508,7 +508,7 @@ class _FeedParserMixin: can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] html_types = ['text/html', 'application/xhtml+xml'] - + def __init__(self, baseuri=None, baselang=None, encoding='utf-8'): if _debug: sys.stderr.write('initializing FeedParser\n') if not self._matchnamespaces: @@ -554,7 +554,7 @@ class _FeedParserMixin: # strict xml parsers do -- account for this difference if isinstance(self, _LooseFeedParser): attrs = [(k, v.replace('&', '&')) for k, v in attrs] - + # track xml:base and xml:lang attrsD = dict(attrs) baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri @@ -582,7 +582,7 @@ class _FeedParserMixin: self.lang = lang self.basestack.append(self.baseuri) self.langstack.append(lang) - + # track namespaces for prefix, uri in attrs: if prefix.startswith('xmlns:'): @@ -620,7 +620,7 @@ class _FeedParserMixin: self.intextinput = 0 if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'): self.inimage = 0 - + # call special handler (if defined) or default handler methodname = '_start_' + prefix + suffix try: @@ -754,7 +754,7 @@ class _FeedParserMixin: elif contentType == 'xhtml': contentType = 'application/xhtml+xml' return contentType - + def trackNamespace(self, prefix, uri): loweruri = uri.lower() if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version: @@ -775,7 +775,7 @@ class _FeedParserMixin: def resolveURI(self, uri): return _urljoin(self.baseuri or '', uri) - + def decodeEntities(self, element, data): return data @@ -788,7 +788,7 @@ class _FeedParserMixin: def pop(self, element, stripWhitespace=1): if not self.elementstack: return if self.elementstack[-1][0] != element: return - + element, expectingText, pieces = self.elementstack.pop() if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml': @@ -833,11 +833,11 @@ class _FeedParserMixin: # In Python 3, base64 takes and outputs bytes, not str # This may not be the most correct way to accomplish this output = _base64decode(output.encode('utf-8')).decode('utf-8') - + # resolve relative URIs if (element in self.can_be_relative_uri) and output: output = self.resolveURI(output) - + # decode entities within embedded markup if not self.contentparams.get('base64', 0): output = self.decodeEntities(element, output) @@ -860,7 +860,7 @@ class _FeedParserMixin: if is_htmlish and RESOLVE_RELATIVE_URIS: if element in self.can_contain_relative_uris: output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) - + # parse microformats # (must do this before sanitizing because some microformats # rely on elements that we sanitize) @@ -876,7 +876,7 @@ class _FeedParserMixin: vcard = mfresults.get('vcard') if vcard: self._getContext()['vcard'] = vcard - + # sanitize embedded markup if is_htmlish and SANITIZE_HTML: if element in self.can_contain_dangerous_markup: @@ -906,7 +906,7 @@ class _FeedParserMixin: if element == 'title' and self.hasTitle: return output - + # store output in appropriate place(s) if self.inentry and not self.insource: if element == 'content': @@ -962,7 +962,7 @@ class _FeedParserMixin: self.incontent -= 1 self.contentparams.clear() return value - + # a number of elements in a number of RSS variants are nominally plain # text, but this is routinely ignored. This is an attempt to detect # the most common cases. As false positives often result in silent @@ -993,7 +993,7 @@ class _FeedParserMixin: prefix = self.namespacemap.get(prefix, prefix) name = prefix + ':' + suffix return name - + def _getAttribute(self, attrsD, name): return attrsD.get(self._mapToStandardPrefix(name)) @@ -1021,7 +1021,7 @@ class _FeedParserMixin: pass attrsD['href'] = href return attrsD - + def _save(self, key, value, overwrite=False): context = self._getContext() if overwrite: @@ -1046,7 +1046,7 @@ class _FeedParserMixin: self.version = 'rss20' else: self.version = 'rss' - + def _start_dlhottitles(self, attrsD): self.version = 'hotrss' @@ -1064,7 +1064,7 @@ class _FeedParserMixin: self._start_link({}) self.elementstack[-1][-1] = attrsD['href'] self._end_link() - + def _start_feed(self, attrsD): self.infeed = 1 versionmap = {'0.1': 'atom01', @@ -1081,7 +1081,7 @@ class _FeedParserMixin: def _end_channel(self): self.infeed = 0 _end_feed = _end_channel - + def _start_image(self, attrsD): context = self._getContext() if not self.inentry: @@ -1089,7 +1089,7 @@ class _FeedParserMixin: self.inimage = 1 self.hasTitle = 0 self.push('image', 0) - + def _end_image(self): self.pop('image') self.inimage = 0 @@ -1101,7 +1101,7 @@ class _FeedParserMixin: self.hasTitle = 0 self.push('textinput', 0) _start_textInput = _start_textinput - + def _end_textinput(self): self.pop('textinput') self.intextinput = 0 @@ -1301,7 +1301,7 @@ class _FeedParserMixin: self.popContent('subtitle') _end_tagline = _end_subtitle _end_itunes_subtitle = _end_subtitle - + def _start_rights(self, attrsD): self.pushContent('rights', attrsD, 'text/plain', 1) _start_dc_rights = _start_rights @@ -1399,7 +1399,7 @@ class _FeedParserMixin: attrsD['rel']='license' if value: attrsD['href']=value context.setdefault('links', []).append(attrsD) - + def _start_creativecommons_license(self, attrsD): self.push('license', 1) _start_creativeCommons_license = _start_creativecommons_license @@ -1420,7 +1420,7 @@ class _FeedParserMixin: value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name}) if value not in xfn: xfn.append(value) - + def _addTag(self, term, scheme, label): context = self._getContext() tags = context.setdefault('tags', []) @@ -1438,7 +1438,7 @@ class _FeedParserMixin: self.push('category', 1) _start_dc_subject = _start_category _start_keywords = _start_category - + def _start_media_category(self, attrsD): attrsD.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema') self._start_category(attrsD) @@ -1446,11 +1446,11 @@ class _FeedParserMixin: def _end_itunes_keywords(self): for term in self.pop('itunes_keywords').split(): self._addTag(term, 'http://www.itunes.com/', None) - + def _start_itunes_category(self, attrsD): self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None) self.push('category', 1) - + def _end_category(self): value = self.pop('category') if not value: return @@ -1467,7 +1467,7 @@ class _FeedParserMixin: def _start_cloud(self, attrsD): self._getContext()['cloud'] = FeedParserDict(attrsD) - + def _start_link(self, attrsD): attrsD.setdefault('rel', 'alternate') if attrsD['rel'] == 'self': @@ -1568,7 +1568,7 @@ class _FeedParserMixin: context = self._getContext() if context.has_key('generator_detail'): context['generator_detail']['name'] = value - + def _start_admin_generatoragent(self, attrsD): self.push('generator', 1) value = self._getAttribute(attrsD, 'rdf:resource') @@ -1583,7 +1583,7 @@ class _FeedParserMixin: if value: self.elementstack[-1][2].append(value) self.pop('errorreportsto') - + def _start_summary(self, attrsD): context = self._getContext() if context.has_key('summary'): @@ -1601,13 +1601,13 @@ class _FeedParserMixin: self.popContent(self._summaryKey or 'summary') self._summaryKey = None _end_itunes_summary = _end_summary - + def _start_enclosure(self, attrsD): attrsD = self._itsAnHrefDamnIt(attrsD) context = self._getContext() attrsD['rel']='enclosure' context.setdefault('links', []).append(FeedParserDict(attrsD)) - + def _start_source(self, attrsD): if 'url' in attrsD: # This means that we're processing a source element from an RSS 2.0 feed @@ -1659,7 +1659,7 @@ class _FeedParserMixin: if attrsD.get('href'): self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) _start_itunes_link = _start_itunes_image - + def _end_itunes_block(self): value = self.pop('itunes_block', 0) self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 @@ -1718,12 +1718,12 @@ if _XML_AVAILABLE: self.bozo = 0 self.exc = None self.decls = {} - + def startPrefixMapping(self, prefix, uri): self.trackNamespace(prefix, uri) if uri == 'http://www.w3.org/1999/xlink': self.decls['xmlns:'+prefix] = uri - + def startElementNS(self, name, qname, attrs): namespace, localname = name lowernamespace = str(namespace or '').lower() @@ -1910,7 +1910,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:]) else: self.pieces.append('&#%(ref)s;' % locals()) - + def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' # Reconstruct the original entity reference. @@ -1925,12 +1925,12 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): # Store the original text verbatim. if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text) self.pieces.append(text) - + def handle_comment(self, text): # called for each HTML comment, e.g. <!-- insert Javascript code here --> # Reconstruct the original comment. self.pieces.append('<!--%(text)s-->' % locals()) - + def handle_pi(self, text): # called for each processing instruction, e.g. <?instruction> # Reconstruct original processing instruction. @@ -1942,7 +1942,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): # "http://www.w3.org/TR/html4/loose.dtd"> # Reconstruct original DOCTYPE self.pieces.append('<!%(text)s>' % locals()) - + _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match def _scan_name(self, i, declstartpos): rawdata = self.rawdata @@ -1998,7 +1998,7 @@ class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): data = data.replace('"', '"') data = data.replace(''', "'") return data - + def strattrs(self, attrs): return ''.join([' %s="%s"' % (n,v.replace('"','"')) for n,v in attrs]) @@ -2022,12 +2022,12 @@ class _MicroformatsParser: self.enclosures = [] self.xfn = [] self.vcard = None - + def vcardEscape(self, s): if type(s) in (type(''), type(u'')): s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n') return s - + def vcardFold(self, s): s = re.sub(';+$', '', s) sFolded = '' @@ -2043,14 +2043,14 @@ class _MicroformatsParser: def normalize(self, s): return re.sub(r'\s+', ' ', s).strip() - + def unique(self, aList): results = [] for element in aList: if element not in results: results.append(element) return results - + def toISO8601(self, dt): return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt) @@ -2140,21 +2140,21 @@ class _MicroformatsParser: def findVCards(self, elmRoot, bAgentParsing=0): sVCards = '' - + if not bAgentParsing: arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1) else: arCards = [elmRoot] - + for elmCard in arCards: arLines = [] - + def processSingleString(sProperty): sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding) if sValue: arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue)) return sValue or u'' - + def processSingleURI(sProperty): sValue = self.getPropertyValue(elmCard, sProperty, self.URI) if sValue: @@ -2177,7 +2177,7 @@ class _MicroformatsParser: if sContentType: sContentType = ';TYPE=' + sContentType.upper() arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue)) - + def processTypeValue(sProperty, arDefaultType, arForceType=None): arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1) for elmResult in arResults: @@ -2189,7 +2189,7 @@ class _MicroformatsParser: sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0) if sValue: arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue)) - + # AGENT # must do this before all other properties because it is destructive # (removes nested class="vcard" nodes so they don't interfere with @@ -2208,10 +2208,10 @@ class _MicroformatsParser: sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1); if sAgentValue: arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue)) - + # FN (full name) sFN = processSingleString('fn') - + # N (name) elmName = self.getPropertyValue(elmCard, 'n') if elmName: @@ -2237,25 +2237,25 @@ class _MicroformatsParser: arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1])) else: arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0])) - + # SORT-STRING sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1) if sSortString: arLines.append(self.vcardFold('SORT-STRING:' + sSortString)) - + # NICKNAME arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1) if arNickname: arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname))) - + # PHOTO processSingleURI('photo') - + # BDAY dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE) if dtBday: arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday))) - + # ADR (address) arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1) for elmAdr in arAdr: @@ -2277,38 +2277,38 @@ class _MicroformatsParser: sRegion + ';' + sPostalCode + ';' + sCountryName)) - + # LABEL processTypeValue('label', ['intl','postal','parcel','work']) - + # TEL (phone number) processTypeValue('tel', ['voice']) - + # EMAIL processTypeValue('email', ['internet'], ['internet']) - + # MAILER processSingleString('mailer') - + # TZ (timezone) processSingleString('tz') - + # GEO (geographical information) elmGeo = self.getPropertyValue(elmCard, 'geo') if elmGeo: sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1) sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1) arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude)) - + # TITLE processSingleString('title') - + # ROLE processSingleString('role') # LOGO processSingleURI('logo') - + # ORG (organization) elmOrg = self.getPropertyValue(elmCard, 'org') if elmOrg: @@ -2322,39 +2322,39 @@ class _MicroformatsParser: else: arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1) arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit))) - + # CATEGORY arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1) if arCategory: arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory))) - + # NOTE processSingleString('note') - + # REV processSingleString('rev') - + # SOUND processSingleURI('sound') - + # UID processSingleString('uid') - + # URL processSingleURI('url') - + # CLASS processSingleString('class') - + # KEY processSingleURI('key') - + if arLines: arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard'] sVCards += u'\n'.join(arLines) + u'\n' - + return sVCards.strip() - + def isProbablyDownloadable(self, elm): attrsD = elm.attrMap if not attrsD.has_key('href'): return 0 @@ -2453,7 +2453,7 @@ class _RelativeURIResolver(_BaseHTMLProcessor): def resolveURI(self, uri): return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip())) - + def unknown_starttag(self, tag, attrs): if _debug: sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs))) @@ -2612,7 +2612,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): self.unacceptablestack = 0 self.mathmlOK = 0 self.svgOK = 0 - + def unknown_starttag(self, tag, attrs): acceptable_attributes = self.acceptable_attributes keymap = {} @@ -2671,7 +2671,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): clean_value = self.sanitize_style(value) if clean_value: clean_attrs.append((key,clean_value)) _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs) - + def unknown_endtag(self, tag): if not tag in self.acceptable_elements: if tag in self.unacceptable_elements_with_end_tag: @@ -2791,7 +2791,7 @@ class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler http_error_300 = http_error_302 http_error_303 = http_error_302 http_error_307 = http_error_302 - + def http_error_401(self, req, fp, code, msg, headers): # Check if # - server requires digest auth, AND @@ -2890,7 +2890,7 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h return opener.open(request) finally: opener.close() # JohnD - + # try to open with native open function (if url_file_stream_or_string is a filename) try: return open(url_file_stream_or_string, 'rb') @@ -2942,7 +2942,7 @@ _date_handlers = [] def registerDateHandler(func): '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' _date_handlers.insert(0, func) - + # ISO-8601 date parsing routines written by Fazal Majid. # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 # parser is beyond the scope of feedparser and would be a worthwhile addition @@ -3055,7 +3055,7 @@ def _parse_date_iso8601(dateString): # Many implementations have bugs, but we'll pretend they don't. return time.localtime(time.mktime(tuple(tm))) registerDateHandler(_parse_date_iso8601) - + # 8-bit date handling routines written by ytrewq1. _korean_year = u'\ub144' # b3e2 in euc-kr _korean_month = u'\uc6d4' # bff9 in euc-kr @@ -3374,7 +3374,7 @@ def _getCharacterEncoding(http_headers, xml_data): http_headers is a dictionary xml_data is a raw string (not Unicode) - + This is so much trickier than it sounds, it's not even funny. According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type is application/xml, application/*+xml, @@ -3393,12 +3393,12 @@ def _getCharacterEncoding(http_headers, xml_data): served with a Content-Type of text/* and no charset parameter must be treated as us-ascii. (We now do this.) And also that it must always be flagged as non-well-formed. (We now do this too.) - + If Content-Type is unspecified (input was local file or non-HTTP source) or unrecognized (server just got it totally wrong), then go by the encoding given in the XML prefix of the document and default to 'iso-8859-1' as per the HTTP specification (RFC 2616). - + Then, assuming we didn't find a character encoding in the HTTP headers (and the HTTP Content-type allowed us to look in the body), we need to sniff the first few bytes of the XML data and try to determine @@ -3508,7 +3508,7 @@ def _getCharacterEncoding(http_headers, xml_data): if true_encoding.lower() == 'gb2312': true_encoding = 'gb18030' return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type - + def _toUTF8(data, encoding): '''Changes an XML data stream on the fly to specify a new encoding @@ -3571,7 +3571,7 @@ def _stripDoctype(data): start = re.search(_s2bytes('<\w'), data) start = start and start.start() or -1 head,data = data[:start+1], data[start+1:] - + entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE) entity_results=entity_pattern.findall(head) head = entity_pattern.sub(_s2bytes(''), head) @@ -3593,10 +3593,10 @@ def _stripDoctype(data): data = doctype_pattern.sub(replacement, head) + data return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)]) - + def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], request_headers={}, response_headers={}): '''Parse a feed from a URL, file, stream, or string. - + request_headers, if given, is a dict from http header name to value to add to the request; this overrides internally generated values. ''' @@ -3837,7 +3837,7 @@ class TextSerializer(Serializer): stream.write('\n') except: pass - + class PprintSerializer(Serializer): def write(self, stream=sys.stdout): if self.results.has_key('href'): @@ -3845,7 +3845,7 @@ class PprintSerializer(Serializer): from pprint import pprint pprint(self.results, stream) stream.write('\n') - + if __name__ == '__main__': try: from optparse import OptionParser |