-rw-r--r--  README.md | 10
-rwxr-xr-x  mwc.py    | 29
2 files changed, 27 insertions(+), 12 deletions(-)
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -31,7 +31,8 @@ sites = [
   {'shortname': 'mywebsite3',
    'uri': 'http://www.mywebsite3.com/info',
    'type': 'text',
-   'contentregex': 'Version\"\:\d*\.\d*'}
+   'contentregex': 'Version\"\:\d*\.\d*',
+   'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'}
 ]
 </code>
@@ -53,8 +54,11 @@ sites = [
    Regular expression. If XPath/CSS selector is defined, the regular expression is applied afterwards.
  * <b>encoding</b> (optional; default: 'utf-8')
    Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'.
- * <b>receiver</b> (optional)
-   Overwrites global receiver specification.
+ * <b>receiver</b> (optional)
+   Overrides global receiver specification.
+ * <b>user-agent</b> (optional)
+   Defines the user agent string, e.g.,
+   'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'
 
 * We collect some XPath/CSS snippets at this place: <a href="https://github.com/Debianguru/MailWebsiteChanges/wiki/snippets">Snippet collection</a> - please feel free to add your own definitions!
diff --git a/mwc.py b/mwc.py
--- a/mwc.py
+++ b/mwc.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3
 
-# Copyright: (2013-2014) Michael Till Beck <Debianguru@gmx.de>
+# Copyright: (2013-2015) Michael Till Beck <Debianguru@gmx.de>
 # License: GPL-2.0+
 
 import urllib.request, urllib.error, urllib.parse
@@ -32,7 +32,7 @@ config = None
 defaultEncoding = 'utf-8'
 maxTitleLength = 150
 
-# this is how an empty feed looks like
+# this is how an empty RSS feed looks like
 emptyfeed = """<?xml version="1.0"?>
 <rss version="2.0">
 <channel>
@@ -52,6 +52,8 @@ mailsession = None
 # translates all relative URIs found in trees to absolute URIs
 def toAbsoluteURIs(trees, baseuri):
     for tree in trees:
+        if isinstance(tree, str):
+            continue
         for uriAttribute in uriAttributes:
             tags = tree.xpath(uriAttribute[0])
             for tag in tags:
@@ -85,7 +87,10 @@ def parseSite(site):
             file = process.stdout
         else:
             # open website
-            file = urllib.request.urlopen(uri)
+            req = urllib.request.Request(uri)
+            if 'user-agent' in site:
+                req.add_header('User-Agent', site['user-agent'])
+            file = urllib.request.urlopen(req)
 
         if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
@@ -126,8 +131,14 @@ def parseSite(site):
             if len(titleresult) == 0:
                 titleresult = contentresult
 
-        contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult]
-        titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult]
+        if isinstance(contentresult, str):
+            contents = [contentresult]
+        else:
+            contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult]
+        if isinstance(titleresult, str):
+            titles = [getSubject(titleresult)]
+        else:
+            titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult]
 
     except IOError as e:
         warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e)
@@ -227,8 +238,8 @@ def getFileContents(shortname):
     result = []
     for f in os.listdir('.'):
         if f.startswith(shortname + '.') and f.endswith('.txt'):
-            file = open(f, 'r')
-            result.append(file.read())
+            file = open(f, 'rb')
+            result.append(file.read().decode('utf-8'))
             file.close()
     return result
@@ -241,8 +252,8 @@ def storeFileContents(shortname, parseResult):
     i = 0
     for c in parseResult['contents']:
-        file = open(shortname + '.' + str(i) + '.txt', 'w')
-        file.write(c)
+        file = open(shortname + '.' + str(i) + '.txt', 'wb')
+        file.write(c.encode('utf-8'))
         file.close()
 
         i += 1
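
For context on the main functional change: the new per-site 'user-agent' option is applied by building a urllib Request and attaching the header before the URL is opened, so sites that reject the default Python user agent can still be fetched. The following is a minimal, self-contained sketch of that technique; the URL and user-agent string are placeholder values, not taken from this commit.

import urllib.request

# hypothetical site entry for illustration; mirrors the shape of the 'sites' entries shown above
site = {
    'uri': 'http://www.example.com/info',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0',
}

req = urllib.request.Request(site['uri'])
if 'user-agent' in site:
    # replace urllib's default "Python-urllib/x.y" User-Agent header
    req.add_header('User-Agent', site['user-agent'])

with urllib.request.urlopen(req) as response:
    content = response.read().decode('utf-8')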
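
The getFileContents/storeFileContents change switches the cache files from locale-dependent text mode to binary mode with explicit UTF-8 encoding, so stored snapshots round-trip identically regardless of the system locale. A small sketch of that pattern, using a made-up cache file name:

# store a snapshot as UTF-8 bytes, independent of the system locale
content = '<p>cached page content</p>'  # placeholder
with open('mywebsite3.0.txt', 'wb') as f:
    f.write(content.encode('utf-8'))

# read it back and decode explicitly
with open('mywebsite3.0.txt', 'rb') as f:
    restored = f.read().decode('utf-8')

assert restored == content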