diff options
Diffstat (limited to 'mwc.py')
-rwxr-xr-x | mwc.py | 29 |
1 files changed, 20 insertions, 9 deletions
@@ -1,6 +1,6 @@ #!/usr/bin/python3 -# Copyright: (2013-2014) Michael Till Beck <Debianguru@gmx.de> +# Copyright: (2013-2015) Michael Till Beck <Debianguru@gmx.de> # License: GPL-2.0+ import urllib.request, urllib.error, urllib.parse @@ -32,7 +32,7 @@ config = None defaultEncoding = 'utf-8' maxTitleLength = 150 -# this is how an empty feed looks like +# this is how an empty RSS feed looks like emptyfeed = """<?xml version="1.0"?> <rss version="2.0"> <channel> @@ -52,6 +52,8 @@ mailsession = None # translates all relative URIs found in trees to absolute URIs def toAbsoluteURIs(trees, baseuri): for tree in trees: + if isinstance(tree, str): + continue for uriAttribute in uriAttributes: tags = tree.xpath(uriAttribute[0]) for tag in tags: @@ -85,7 +87,10 @@ def parseSite(site): file = process.stdout else: # open website - file = urllib.request.urlopen(uri) + req = urllib.request.Request(uri) + if 'user-agent' in site: + req.add_header('User-Agent', site['user-agent']) + file = urllib.request.urlopen(req) if contenttype == 'text' or (contentxpath == '' and titlexpath == ''): @@ -126,8 +131,14 @@ def parseSite(site): if len(titleresult) == 0: titleresult = contentresult - contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult] - titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult] + if isinstance(contentresult, str): + contents = [contentresult] + else: + contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult] + if isinstance(titleresult, str): + titles = [getSubject(titleresult)] + else: + titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult] except IOError as e: warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e) @@ -227,8 +238,8 @@ def getFileContents(shortname): result = [] for f in os.listdir('.'): if f.startswith(shortname + '.') and f.endswith('.txt'): - file = open(f, 'r') - result.append(file.read()) + file = open(f, 'rb') + result.append(file.read().decode('utf-8')) file.close() return result @@ -241,8 +252,8 @@ def storeFileContents(shortname, parseResult): i = 0 for c in parseResult['contents']: - file = open(shortname + '.' + str(i) + '.txt', 'w') - file.write(c) + file = open(shortname + '.' + str(i) + '.txt', 'wb') + file.write(c.encode('utf-8')) file.close() i += 1 |