From 0e1e73bdd5ba9ca6fcbfec9dd46060832a0104ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Mon, 8 May 2017 09:22:44 +0200 Subject: New upstream version 1.7.4 --- mwc.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'mwc.py') diff --git a/mwc.py b/mwc.py index 5d7278e..606f504 100755 --- a/mwc.py +++ b/mwc.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 -# Copyright: (2013-2014) Michael Till Beck +# Copyright: (2013-2015) Michael Till Beck # License: GPL-2.0+ import urllib.request, urllib.error, urllib.parse @@ -32,7 +32,7 @@ config = None defaultEncoding = 'utf-8' maxTitleLength = 150 -# this is how an empty feed looks like +# this is how an empty RSS feed looks like emptyfeed = """ @@ -52,6 +52,8 @@ mailsession = None # translates all relative URIs found in trees to absolute URIs def toAbsoluteURIs(trees, baseuri): for tree in trees: + if isinstance(tree, str): + continue for uriAttribute in uriAttributes: tags = tree.xpath(uriAttribute[0]) for tag in tags: @@ -85,7 +87,10 @@ def parseSite(site): file = process.stdout else: # open website - file = urllib.request.urlopen(uri) + req = urllib.request.Request(uri) + if 'user-agent' in site: + req.add_header('User-Agent', site['user-agent']) + file = urllib.request.urlopen(req) if contenttype == 'text' or (contentxpath == '' and titlexpath == ''): @@ -126,8 +131,14 @@ def parseSite(site): if len(titleresult) == 0: titleresult = contentresult - contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult] - titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult] + if isinstance(contentresult, str): + contents = [contentresult] + else: + contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult] + if isinstance(titleresult, str): + titles = [getSubject(titleresult)] + else: + titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult] except IOError as e: warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e) @@ -227,8 +238,8 @@ def getFileContents(shortname): result = [] for f in os.listdir('.'): if f.startswith(shortname + '.') and f.endswith('.txt'): - file = open(f, 'r') - result.append(file.read()) + file = open(f, 'rb') + result.append(file.read().decode('utf-8')) file.close() return result @@ -241,8 +252,8 @@ def storeFileContents(shortname, parseResult): i = 0 for c in parseResult['contents']: - file = open(shortname + '.' + str(i) + '.txt', 'w') - file.write(c) + file = open(shortname + '.' + str(i) + '.txt', 'wb') + file.write(c.encode('utf-8')) file.close() i += 1 -- cgit v1.2.3