summary | refs | log | tree | commit | diff
path: root/mwc.py
diff options
context:
space:
mode:
Diffstat (limited to 'mwc.py')
-rwxr-xr-x mwc.py 29
1 files changed, 20 insertions, 9 deletions
diff --git a/mwc.py b/mwc.py
index 5d7278e..606f504 100755
--- a/mwc.py
+++ b/mwc.py
@@ -1,6 +1,6 @@
#!/usr/bin/python3
-# Copyright: (2013-2014) Michael Till Beck <Debianguru@gmx.de>
+# Copyright: (2013-2015) Michael Till Beck <Debianguru@gmx.de>
# License: GPL-2.0+
import urllib.request, urllib.error, urllib.parse
@@ -32,7 +32,7 @@ config = None
defaultEncoding = 'utf-8'
maxTitleLength = 150
-# this is how an empty feed looks like
+# this is how an empty RSS feed looks like
emptyfeed = """<?xml version="1.0"?>
<rss version="2.0">
<channel>
@@ -52,6 +52,8 @@ mailsession = None
# translates all relative URIs found in trees to absolute URIs
def toAbsoluteURIs(trees, baseuri):
for tree in trees:
+ if isinstance(tree, str):
+ continue
for uriAttribute in uriAttributes:
tags = tree.xpath(uriAttribute[0])
for tag in tags:
@@ -85,7 +87,10 @@ def parseSite(site):
file = process.stdout
else:
# open website
- file = urllib.request.urlopen(uri)
+ req = urllib.request.Request(uri)
+ if 'user-agent' in site:
+ req.add_header('User-Agent', site['user-agent'])
+ file = urllib.request.urlopen(req)
if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
@@ -126,8 +131,14 @@ def parseSite(site):
if len(titleresult) == 0:
titleresult = contentresult
- contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult]
- titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult]
+ if isinstance(contentresult, str):
+ contents = [contentresult]
+ else:
+ contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult]
+ if isinstance(titleresult, str):
+ titles = [getSubject(titleresult)]
+ else:
+ titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult]
except IOError as e:
warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e)
@@ -227,8 +238,8 @@ def getFileContents(shortname):
result = []
for f in os.listdir('.'):
if f.startswith(shortname + '.') and f.endswith('.txt'):
- file = open(f, 'r')
- result.append(file.read())
+ file = open(f, 'rb')
+ result.append(file.read().decode('utf-8'))
file.close()
return result
@@ -241,8 +252,8 @@ def storeFileContents(shortname, parseResult):
i = 0
for c in parseResult['contents']:
- file = open(shortname + '.' + str(i) + '.txt', 'w')
- file.write(c)
+ file = open(shortname + '.' + str(i) + '.txt', 'wb')
+ file.write(c.encode('utf-8'))
file.close()
i += 1