From 0e1e73bdd5ba9ca6fcbfec9dd46060832a0104ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Mon, 8 May 2017 09:22:44 +0200 Subject: New upstream version 1.7.4 --- README.md | 10 +++++++--- mwc.py | 29 ++++++++++++++++++++--------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 7419ad2..6c6d731 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,8 @@ sites = [ {'shortname': 'mywebsite3', 'uri': 'http://www.mywebsite3.com/info', 'type': 'text', - 'contentregex': 'Version\"\:\d*\.\d*'} + 'contentregex': 'Version\"\:\d*\.\d*', + 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'} ] @@ -53,8 +54,11 @@ sites = [ Regular expression. If XPath/CSS selector is defined, the regular expression is applied afterwards. * encoding (optional; default: 'utf-8') Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'. - * receiver (optional) - Overwrites global receiver specification. + * receiver (optional) + Overrides global receiver specification. + * user-agent (optional) + Defines the user agent string, e.g., + 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0' * We collect some XPath/CSS snippets at this place: Snippet collection - please feel free to add your own definitions! diff --git a/mwc.py b/mwc.py index 5d7278e..606f504 100755 --- a/mwc.py +++ b/mwc.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 -# Copyright: (2013-2014) Michael Till Beck +# Copyright: (2013-2015) Michael Till Beck # License: GPL-2.0+ import urllib.request, urllib.error, urllib.parse @@ -32,7 +32,7 @@ config = None defaultEncoding = 'utf-8' maxTitleLength = 150 -# this is how an empty feed looks like +# this is how an empty RSS feed looks like emptyfeed = """ @@ -52,6 +52,8 @@ mailsession = None # translates all relative URIs found in trees to absolute URIs def toAbsoluteURIs(trees, baseuri): for tree in trees: + if isinstance(tree, str): + continue for uriAttribute in uriAttributes: tags = tree.xpath(uriAttribute[0]) for tag in tags: @@ -85,7 +87,10 @@ def parseSite(site): file = process.stdout else: # open website - file = urllib.request.urlopen(uri) + req = urllib.request.Request(uri) + if 'user-agent' in site: + req.add_header('User-Agent', site['user-agent']) + file = urllib.request.urlopen(req) if contenttype == 'text' or (contentxpath == '' and titlexpath == ''): @@ -126,8 +131,14 @@ def parseSite(site): if len(titleresult) == 0: titleresult = contentresult - contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult] - titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult] + if isinstance(contentresult, str): + contents = [contentresult] + else: + contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult] + if isinstance(titleresult, str): + titles = [getSubject(titleresult)] + else: + titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult] except IOError as e: warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e) @@ -227,8 +238,8 @@ def getFileContents(shortname): result = [] for f in os.listdir('.'): if f.startswith(shortname + '.') and f.endswith('.txt'): - file = open(f, 'r') - result.append(file.read()) + file = open(f, 'rb') + result.append(file.read().decode('utf-8')) file.close() return result @@ -241,8 +252,8 @@ def storeFileContents(shortname, parseResult): i = 0 for c in parseResult['contents']: - file = open(shortname + '.' + str(i) + '.txt', 'w') - file.write(c) + file = open(shortname + '.' + str(i) + '.txt', 'wb') + file.write(c.encode('utf-8')) file.close() i += 1 -- cgit v1.2.3