From 0e1e73bdd5ba9ca6fcbfec9dd46060832a0104ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= <debian@jff-webhosting.net>
Date: Mon, 8 May 2017 09:22:44 +0200
Subject: New upstream version 1.7.4

---
 README.md | 10 +++++++---
 mwc.py    | 29 ++++++++++++++++++++---------
 2 files changed, 27 insertions(+), 12 deletions(-)
diff --git a/README.md b/README.md
index 7419ad2..6c6d731 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,8 @@ sites = [
           {'shortname': 'mywebsite3',
            'uri': 'http://www.mywebsite3.com/info',
            'type': 'text',
-           'contentregex': 'Version\"\:\d*\.\d*'}
+           'contentregex': 'Version\"\:\d*\.\d*',
+           'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'}
 
 ]
 </code>
@@ -53,8 +54,11 @@ sites = [
      Regular expression. If XPath/CSS selector is defined, the regular expression is applied afterwards.
    * <b>encoding</b> (optional; default: 'utf-8')  
      Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'.
-   * <b>receiver</b> (optional)
-     Overwrites global receiver specification.
+   * <b>receiver</b> (optional)  
+     Overrides global receiver specification.
+   * <b>user-agent</b> (optional)  
+     Sets the User-Agent header sent when requesting the website, e.g.,  
+     'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'
 
 
  * We collect some XPath/CSS snippets at this place: <a href="https://github.com/Debianguru/MailWebsiteChanges/wiki/snippets">Snippet collection</a> - please feel free to add your own definitions!
diff --git a/mwc.py b/mwc.py
index 5d7278e..606f504 100755
--- a/mwc.py
+++ b/mwc.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3
 
-# Copyright: (2013-2014) Michael Till Beck <Debianguru@gmx.de>
+# Copyright: (2013-2015) Michael Till Beck <Debianguru@gmx.de>
 # License: GPL-2.0+
 
 import urllib.request, urllib.error, urllib.parse
@@ -32,7 +32,7 @@ config = None
 defaultEncoding = 'utf-8'
 maxTitleLength = 150
 
-# this is how an empty feed looks like
+# this is what an empty RSS feed looks like
 emptyfeed = """<?xml version="1.0"?>
 <rss version="2.0">
  <channel>
@@ -52,6 +52,8 @@ mailsession = None
 # translates all relative URIs found in trees to absolute URIs
 def toAbsoluteURIs(trees, baseuri):
         for tree in trees:
+                if isinstance(tree, str):
+                        continue
                 for uriAttribute in uriAttributes:
                         tags = tree.xpath(uriAttribute[0])
                         for tag in tags:
@@ -85,7 +87,10 @@ def parseSite(site):
                         file = process.stdout
                 else:
                         # open website
-                        file = urllib.request.urlopen(uri)
+                        req = urllib.request.Request(uri)
+                        if 'user-agent' in site:
+                            req.add_header('User-Agent', site['user-agent'])
+                        file = urllib.request.urlopen(req)
 
 
                 if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
@@ -126,8 +131,14 @@ def parseSite(site):
                                 if len(titleresult) == 0:
                                         titleresult = contentresult
 
-                        contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult]
-                        titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult]
+                        if isinstance(contentresult, str):
+                                contents = [contentresult]
+                        else:
+                                contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult]
+                        if isinstance(titleresult, str):
+                                titles = [getSubject(titleresult)]
+                        else:
+                                titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult]
 
         except IOError as e:
                 warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e)
@@ -227,8 +238,8 @@ def getFileContents(shortname):
         result = []
         for f in os.listdir('.'):
                 if f.startswith(shortname + '.') and f.endswith('.txt'):
-                        file = open(f, 'r')
-                        result.append(file.read())
+                        file = open(f, 'rb')
+                        result.append(file.read().decode('utf-8'))
                         file.close()
         return result
 
@@ -241,8 +252,8 @@ def storeFileContents(shortname, parseResult):
 
         i = 0
         for c in parseResult['contents']:
-                file = open(shortname + '.' + str(i) + '.txt', 'w')
-                file.write(c)
+                file = open(shortname + '.' + str(i) + '.txt', 'wb')
+                file.write(c.encode('utf-8'))
                 file.close()
                 i += 1
 
-- 
cgit v1.2.3