summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- README.md 10
-rwxr-xr-x mwc.py 29
2 files changed, 27 insertions, 12 deletions
diff --git a/README.md b/README.md
index 7419ad2..6c6d731 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,8 @@ sites = [
{'shortname': 'mywebsite3',
'uri': 'http://www.mywebsite3.com/info',
'type': 'text',
- 'contentregex': 'Version\"\:\d*\.\d*'}
+ 'contentregex': 'Version\"\:\d*\.\d*',
+ 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'}
]
</code>
@@ -53,8 +54,11 @@ sites = [
Regular expression. If XPath/CSS selector is defined, the regular expression is applied afterwards.
* <b>encoding</b> (optional; default: 'utf-8')
Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'.
- * <b>receiver</b> (optional)
- Overwrites global receiver specification.
+ * <b>receiver</b> (optional)
+ Overrides global receiver specification.
+ * <b>user-agent</b> (optional)
+ Defines the user agent string, e.g.,
+ 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'
* We collect some XPath/CSS snippets at this place: <a href="https://github.com/Debianguru/MailWebsiteChanges/wiki/snippets">Snippet collection</a> - please feel free to add your own definitions!
diff --git a/mwc.py b/mwc.py
index 5d7278e..606f504 100755
--- a/mwc.py
+++ b/mwc.py
@@ -1,6 +1,6 @@
#!/usr/bin/python3
-# Copyright: (2013-2014) Michael Till Beck <Debianguru@gmx.de>
+# Copyright: (2013-2015) Michael Till Beck <Debianguru@gmx.de>
# License: GPL-2.0+
import urllib.request, urllib.error, urllib.parse
@@ -32,7 +32,7 @@ config = None
defaultEncoding = 'utf-8'
maxTitleLength = 150
-# this is how an empty feed looks like
+# this is how an empty RSS feed looks like
emptyfeed = """<?xml version="1.0"?>
<rss version="2.0">
<channel>
@@ -52,6 +52,8 @@ mailsession = None
# translates all relative URIs found in trees to absolute URIs
def toAbsoluteURIs(trees, baseuri):
for tree in trees:
+ if isinstance(tree, str):
+ continue
for uriAttribute in uriAttributes:
tags = tree.xpath(uriAttribute[0])
for tag in tags:
@@ -85,7 +87,10 @@ def parseSite(site):
file = process.stdout
else:
# open website
- file = urllib.request.urlopen(uri)
+ req = urllib.request.Request(uri)
+ if 'user-agent' in site:
+ req.add_header('User-Agent', site['user-agent'])
+ file = urllib.request.urlopen(req)
if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
@@ -126,8 +131,14 @@ def parseSite(site):
if len(titleresult) == 0:
titleresult = contentresult
- contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult]
- titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult]
+ if isinstance(contentresult, str):
+ contents = [contentresult]
+ else:
+ contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult]
+ if isinstance(titleresult, str):
+ titles = [getSubject(titleresult)]
+ else:
+ titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult]
except IOError as e:
warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e)
@@ -227,8 +238,8 @@ def getFileContents(shortname):
result = []
for f in os.listdir('.'):
if f.startswith(shortname + '.') and f.endswith('.txt'):
- file = open(f, 'r')
- result.append(file.read())
+ file = open(f, 'rb')
+ result.append(file.read().decode('utf-8'))
file.close()
return result
@@ -241,8 +252,8 @@ def storeFileContents(shortname, parseResult):
i = 0
for c in parseResult['contents']:
- file = open(shortname + '.' + str(i) + '.txt', 'w')
- file.write(c)
+ file = open(shortname + '.' + str(i) + '.txt', 'wb')
+ file.write(c.encode('utf-8'))
file.close()
i += 1