From 76ea31d1747d8d95ec7ac75be750176beb452f66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 6 Aug 2017 19:52:14 +0200 Subject: New upstream version 1.7.6 --- README.md | 3 +++ config_template.py | 3 ++- mwc.py | 23 +++++++++++++++++++---- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d008527..8e78da6 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,8 @@ sites = [ Regular expression. If XPath/CSS selector is defined, the regular expression is applied afterwards. * encoding (optional; default: 'utf-8') Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'. + * splitregex (optional) + only works if type is set to 'text'; defines that content should be split to chunks based on the defined regex expression. * receiver (optional) Overrides global receiver specification. * user-agent (optional) @@ -79,6 +81,7 @@ sites = [
 
 enableMailNotifications = True   #enable/disable notification messages; if set to False, only send error messages
+maxMailsPerSession = -1   #max. number of mails to send per session; ignored when set to -1
 subjectPostfix = 'A website has been updated!'
 
 sender = 'me@mymail.com'
diff --git a/config_template.py b/config_template.py
index 02f7579..f394e52 100644
--- a/config_template.py
+++ b/config_template.py
@@ -15,7 +15,7 @@ sites = [
            'titleregex': '',
            'contentregex': '',
            'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0',
-           'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+           'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
            'encoding': 'utf-8'},
 
           {'shortname': 'mywebsite2',
@@ -42,6 +42,7 @@ sites = [
 subjectPostfix = 'A website has been updated!'
 
 enableMailNotifications = True
+maxMailsPerSession = -1
 sender = 'me@mymail.com'
 smtphost = 'mysmtpprovider.com'
 useTLS = True
diff --git a/mwc.py b/mwc.py
index a0635a1..c420a74 100755
--- a/mwc.py
+++ b/mwc.py
@@ -69,6 +69,7 @@ def parseSite(site):
         contenttype = site.get('type', 'html')
         contentregex = site.get('contentregex', '')
         titleregex = site.get('titleregex', '')
+        splitregex = site.get('splitregex', '')
         enc = site.get('encoding', defaultEncoding)
 
         contentxpath = site.get('contentxpath', '')
@@ -96,7 +97,10 @@ def parseSite(site):
 
 
                 if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
-                        contents = [file.read().decode(enc)]
+                        thefullcontent = file.read().decode(enc)
+                        contents = [thefullcontent]
+                        if splitregex != '':
+                                contents = thefullcontent.split(splitregex)
                         titles = []
                 else:
                         baseuri = uri
@@ -248,13 +252,13 @@ def getFileContents(shortname):
 
 
 # updates list of content that is stored locally for a specific site
-def storeFileContents(shortname, parseResult):
+def storeFileContents(shortname, contents):
         for f in os.listdir('.'):
                 if f.startswith(shortname + '.') and f.endswith('.txt'):
                         os.remove(f)
 
         i = 0
-        for c in parseResult['contents']:
+        for c in contents:
                 file = open(shortname + '.' + str(i) + '.txt', 'wb')
                 file.write(c.encode('utf-8'))
                 file.close()
@@ -271,7 +275,11 @@ def pollWebsites():
                         feedXML = etree.parse(io.StringIO(emptyfeed))
 
         # start polling sites
+        sessionContents = []
+        mailsSent = 0
         for site in config.sites:
+                if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession:
+                        break
 
                 print('polling site [' + site['shortname'] + '] ...')
                 parseResult = parseSite(site)
@@ -283,6 +291,7 @@ def pollWebsites():
                         print('WARNING: ' + parseResult['warning'])
                         if config.enableMailNotifications:
                                 sendmail(receiver, subject, parseResult['warning'], False, None)
+                                mailsSent = mailsSent + 1
                         if config.enableRSSFeed:
                                 feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0))
                 else:
@@ -291,13 +300,18 @@ def pollWebsites():
                         fileContents = getFileContents(site['shortname'])
                         i = 0
                         for content in parseResult['contents']:
+                                if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession:
+                                        break
+
                                 if content not in fileContents:
                                         changes += 1
+                                        sessionContents.append(content)
 
                                         subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i]
                                         print('    ' + subject)
                                         if config.enableMailNotifications and len(fileContents) > 0:
                                                 sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri'])
+                                                mailsSent = mailsSent + 1
 
                                         if config.enableRSSFeed:
                                                 feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes))
@@ -305,7 +319,7 @@ def pollWebsites():
 
 
                         if changes > 0:
-                                storeFileContents(site['shortname'], parseResult)
+                                storeFileContents(site['shortname'], sessionContents)
                                 print('        ' + str(changes) + ' updates')
  
         # store feed
@@ -343,6 +357,7 @@ if __name__ == "__main__":
                         if site['shortname'] == dryrun:
                                 parseResult = parseSite(site)
                                 print(parseResult)
+                                print(str(len(parseResult['contents'])) + " results")
                                 break
         else:
                 try:
-- 
cgit v1.2.3


From e121929e4aca8e1ce8167d5c7e0661ac48d6327d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= 
Date: Sun, 6 Aug 2017 19:55:54 +0200
Subject: New upstream release

---
 debian/changelog | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/debian/changelog b/debian/changelog
index 48ee0e8..44b9c3b 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-mwc (1.7.5-1) UNRELEASED; urgency=medium
+mwc (1.7.6-1) UNRELEASED; urgency=medium
 
   * New upstream release.
   * Renumbering patches.
@@ -13,7 +13,7 @@ mwc (1.7.5-1) UNRELEASED; urgency=medium
   * debian/copyright:
     - Refresh copyright year at * and debian/*.
 
- -- Jörg Frings-Fürst   Tue, 18 Apr 2017 11:06:04 +0200
+ -- Jörg Frings-Fürst   Sun, 06 Aug 2017 19:52:54 +0200
 
 mwc (1.7.2-3) unstable; urgency=medium
 
-- 
cgit v1.2.3


From 2aa6f5ebd42d98f0d9a3b7d94076ac9b6ee1ba45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= 
Date: Sun, 6 Aug 2017 20:16:36 +0200
Subject: New README.source to explain the branching model used; Declare
 compliance with Debian Policy 4.0.0

---
 .pc/.dpkg-source-unapply |  0
 .pc/.quilt_patches       |  1 +
 .pc/.quilt_series        |  1 +
 .pc/.version             |  1 +
 debian/README.source     | 18 ++++++++++++++++++
 debian/changelog         |  3 ++-
 debian/control           |  2 +-
 7 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 .pc/.dpkg-source-unapply
 create mode 100644 .pc/.quilt_patches
 create mode 100644 .pc/.quilt_series
 create mode 100644 .pc/.version
 create mode 100644 debian/README.source

diff --git a/.pc/.dpkg-source-unapply b/.pc/.dpkg-source-unapply
new file mode 100644
index 0000000..e69de29
diff --git a/.pc/.quilt_patches b/.pc/.quilt_patches
new file mode 100644
index 0000000..6857a8d
--- /dev/null
+++ b/.pc/.quilt_patches
@@ -0,0 +1 @@
+debian/patches
diff --git a/.pc/.quilt_series b/.pc/.quilt_series
new file mode 100644
index 0000000..c206706
--- /dev/null
+++ b/.pc/.quilt_series
@@ -0,0 +1 @@
+series
diff --git a/.pc/.version b/.pc/.version
new file mode 100644
index 0000000..0cfbf08
--- /dev/null
+++ b/.pc/.version
@@ -0,0 +1 @@
+2
diff --git a/debian/README.source b/debian/README.source
new file mode 100644
index 0000000..e4f2b3d
--- /dev/null
+++ b/debian/README.source
@@ -0,0 +1,18 @@
+Hello,
+
+now I use the branching model from Vincent Driessen[1].
+
+I use gitflow-avh[2] together with its documentation[3].
+The Debian package can be found here[4].
+
+For unattended uploads, please use a feature/ branch.
+
+
+Many thanks.
+
+ -- Jörg Frings-Fürst   Fri, 02 Jun 2017 19:00:40 +0200
+
+[1] http://nvie.com/posts/a-successful-git-branching-model/
+[2] https://github.com/petervanderdoes/gitflow-avh
+[3] https://github.com/petervanderdoes/gitflow-avh/wiki
+[4] https://tracker.debian.org/pkg/git-flow
diff --git a/debian/changelog b/debian/changelog
index 44b9c3b..18767a8 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -5,13 +5,14 @@ mwc (1.7.6-1) UNRELEASED; urgency=medium
   * debian/patches/0105-try_mail.diff:
     - Replace undefined printf with print (Closes: #860494).
   * Rewrite debian/watch for archives without "v" in front of the version.
-  * Bump Standards-Version to 3.9.8.
+  * Declare compliance with Debian Policy 4.0.0. (No changes needed).
   * Bump compatlevel to 10 (no changes required):
     - Change debian/compat to 10.
     - At debian/control change requested version of debhelper to >= 10.
   * At debian/control change Vcs-Browser to secure URI.
   * debian/copyright:
     - Refresh copyright year at * and debian/*.
+  * New README.source to explain the branching model used.
 
  -- Jörg Frings-Fürst   Sun, 06 Aug 2017 19:52:54 +0200
 
diff --git a/debian/control b/debian/control
index 70dd2d3..f38df7d 100644
--- a/debian/control
+++ b/debian/control
@@ -6,7 +6,7 @@ Build-Depends:
  debhelper (>= 10),
  dh-python,
  python3-all
-Standards-Version: 3.9.8
+Standards-Version: 4.0.0
 Homepage: https://github.com/Debianguru/MailWebsiteChanges
 Vcs-Git: git://anonscm.debian.org/collab-maint/mwc.git
 Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/mwc.git
-- 
cgit v1.2.3


From 6c03e9d2fa808b9c5a223c4d01f4d0b848fe97f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= 
Date: Sun, 6 Aug 2017 20:17:43 +0200
Subject: Add .pc to .gitignore

---
 .gitignore               | 1 +
 .pc/.dpkg-source-unapply | 0
 .pc/.quilt_patches       | 1 -
 .pc/.quilt_series        | 1 -
 .pc/.version             | 1 -
 5 files changed, 1 insertion(+), 3 deletions(-)
 delete mode 100644 .pc/.dpkg-source-unapply
 delete mode 100644 .pc/.quilt_patches
 delete mode 100644 .pc/.quilt_series
 delete mode 100644 .pc/.version

diff --git a/.gitignore b/.gitignore
index 4ccd411..f945b3b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,5 @@
 /*.txt
 *~
 *.pyc
+.pc
 
diff --git a/.pc/.dpkg-source-unapply b/.pc/.dpkg-source-unapply
deleted file mode 100644
index e69de29..0000000
diff --git a/.pc/.quilt_patches b/.pc/.quilt_patches
deleted file mode 100644
index 6857a8d..0000000
--- a/.pc/.quilt_patches
+++ /dev/null
@@ -1 +0,0 @@
-debian/patches
diff --git a/.pc/.quilt_series b/.pc/.quilt_series
deleted file mode 100644
index c206706..0000000
--- a/.pc/.quilt_series
+++ /dev/null
@@ -1 +0,0 @@
-series
diff --git a/.pc/.version b/.pc/.version
deleted file mode 100644
index 0cfbf08..0000000
--- a/.pc/.version
+++ /dev/null
@@ -1 +0,0 @@
-2
-- 
cgit v1.2.3


From f8f939634396158de53fb26fa7f9a539a92fb219 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= 
Date: Fri, 11 Aug 2017 04:42:19 +0200
Subject: New upstream version 1.8.2

---
 mwc.py | 574 ++++++++++++++++++++++++++++++++---------------------------------
 1 file changed, 287 insertions(+), 287 deletions(-)

diff --git a/mwc.py b/mwc.py
index c420a74..4df4799 100755
--- a/mwc.py
+++ b/mwc.py
@@ -1,4 +1,5 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 
 # Copyright: (2013-2015) Michael Till Beck 
 # License: GPL-2.0+
@@ -9,6 +10,7 @@ from lxml import etree
 from cssselect import GenericTranslator
 import re
 import io
+import hashlib
 
 import smtplib
 from email.mime.text import MIMEText
@@ -51,324 +53,322 @@ mailsession = None
 
 # translates all relative URIs found in trees to absolute URIs
 def toAbsoluteURIs(trees, baseuri):
-        for tree in trees:
-                if isinstance(tree, str):
-                        continue
-                for uriAttribute in uriAttributes:
-                        tags = tree.xpath(uriAttribute[0])
-                        for tag in tags:
-                                if tag.attrib.get(uriAttribute[1]) != None:
-                                        if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '':
-                                                tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]])
+    for tree in trees:
+        if isinstance(tree, str):
+            continue
+        for uriAttribute in uriAttributes:
+            tags = tree.xpath(uriAttribute[0])
+            for tag in tags:
+                if tag.attrib.get(uriAttribute[1]) != None:
+                    if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '':
+                        tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]])
 
 
 def parseSite(site):
-        file, content, titles, warning = None, None, None, None
-
-        uri = site['uri']
-        contenttype = site.get('type', 'html')
-        contentregex = site.get('contentregex', '')
-        titleregex = site.get('titleregex', '')
-        splitregex = site.get('splitregex', '')
-        enc = site.get('encoding', defaultEncoding)
-
-        contentxpath = site.get('contentxpath', '')
-        if contentxpath == '' and site.get('contentcss', '') != '':
-                # CSS
-                contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss'))
-        titlexpath = site.get('titlexpath', '')
-        if titlexpath == '' and site.get('titlecss', '') != '':
-                titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss'))
-
-        try:
-
-                if uri.startswith(cmdscheme):
-                        # run command and retrieve output
-                        process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True)
-                        file = process.stdout
-                else:
-                        # open website
-                        req = urllib.request.Request(uri)
-                        if 'user-agent' in site:
-                            req.add_header('User-Agent', site['user-agent'])
-                        if 'accept' in site:
-                            req.add_header('Accept', site['accept'])
-                        file = urllib.request.urlopen(req)
-
-
-                if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
-                        thefullcontent = file.read().decode(enc)
-                        contents = [thefullcontent]
-                        if splitregex != '':
-                                contents = thefullcontent.split(splitregex)
-                        titles = []
-                else:
-                        baseuri = uri
-                        if contenttype == 'html':
-                                parser = etree.HTMLParser(encoding=enc)
-                        else:
-                                parser = etree.XMLParser(recover=True, encoding=enc)
-
-                        tree = etree.parse(file, parser)
-
-                        # xpath
-                        contentresult = tree.xpath(contentxpath) if contentxpath else []
-                        titleresult = tree.xpath(titlexpath) if titlexpath else []
-
-                        # translate relative URIs to absolute URIs
-                        if contenttype == 'html':
-                                basetaglist = tree.xpath('/html/head/base')
-                                if len(basetaglist) != 0:
-                                        baseuri = basetaglist[0].attrib['href']
-                                if len(contentresult) != 0:
-                                        toAbsoluteURIs(contentresult, baseuri)
-                                if len(titleresult) != 0:
-                                        toAbsoluteURIs(titleresult, baseuri)
-
-                        if contentxpath != '' and titlexpath != '' and len(contentresult) != len(titleresult):
-                                warning = 'WARNING: number of title blocks (' + str(len(titleresult)) + ') does not match number of content blocks (' + str(len(contentresult)) + ')'
-                        elif contentxpath and len(contentresult) == 0:
-                                warning = 'WARNING: content selector became invalid!'
-                        elif titlexpath and len(titleresult) == 0:
-                                warning = 'WARNING: title selector became invalid!'
-                        else:
-                                if len(contentresult) == 0:
-                                        contentresult = titleresult
-                                if len(titleresult) == 0:
-                                        titleresult = contentresult
-
-                        if isinstance(contentresult, str):
-                                contents = [contentresult]
-                        else:
-                                contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult]
-                        if isinstance(titleresult, str):
-                                titles = [getSubject(titleresult)]
-                        else:
-                                titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult]
-
-        except IOError as e:
-                warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e)
-
-        if file is not None:
-                file.close()
-
-        if uri.startswith(cmdscheme) and process.wait() != 0:
-                warning = 'WARNING: process terminated with an error'
-
-        if warning:
-                return {'content': content, 'titles': titles, 'warning': warning}
-
-        # parse regex
-        if contentregex:
-                contents = [x for y in [re.findall(r'' + contentregex, c, re.S) for c in contents] for x in y]
-        if titleregex:
-                titles = [x for y in [re.findall(r'' + titleregex, c, re.S) for c in titles] for x in y]
-
-        if contentregex and titleregex and len(contents) != len(titles):
-                warning = 'WARNING: number of title blocks (' + str(len(titles)) + ') does not match number of content blocks (' + str(len(contents)) + ') after regex'
-        elif contentregex and len(contents) == 0:
-                warning = 'WARNING: content regex became invalid!'
-        elif titleregex and len(titles) == 0:
-                warning = 'WARNING: title regex became invalid!'
+    global defaultEncoding
+    file, content, titles, warning = None, None, None, None
+
+    uri = site['uri']
+    contenttype = site.get('type', 'html')
+    contentregex = site.get('contentregex', '')
+    titleregex = site.get('titleregex', '')
+    splitregex = site.get('splitregex', '')
+    enc = site.get('encoding', defaultEncoding)
+
+    contentxpath = site.get('contentxpath', '')
+    if contentxpath == '' and site.get('contentcss', '') != '':
+        # CSS
+        contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss'))
+    titlexpath = site.get('titlexpath', '')
+    if titlexpath == '' and site.get('titlecss', '') != '':
+        titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss'))
+
+    try:
+
+        if uri.startswith(cmdscheme):
+            # run command and retrieve output
+            process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True)
+            file = process.stdout
         else:
-                if len(contents) == 0:
-                        contents = titles
-                if len(titles) == 0:
-                        titles = [getSubject(c) for c in contents]
-
-        return {'contents': contents, 'titles': titles, 'warning': warning}
+            # open website
+            req = urllib.request.Request(uri)
+            if 'user-agent' in site:
+                req.add_header('User-Agent', site['user-agent'])
+            if 'accept' in site:
+                req.add_header('Accept', site['accept'])
+            file = urllib.request.urlopen(req)
+
+
+        if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
+            thefullcontent = file.read().decode(enc, errors='ignore')
+            contents = [thefullcontent]
+            if splitregex != '':
+                contents = thefullcontent.split(splitregex)
+            titles = []
+        else:
+            baseuri = uri
+            if contenttype == 'html':
+                parser = etree.HTMLParser(encoding=enc)
+            else:
+                parser = etree.XMLParser(recover=True, encoding=enc)
+
+            tree = etree.parse(file, parser)
+
+            # xpath
+            contentresult = tree.xpath(contentxpath) if contentxpath else []
+            titleresult = tree.xpath(titlexpath) if titlexpath else []
+
+            # translate relative URIs to absolute URIs
+            if contenttype == 'html':
+                basetaglist = tree.xpath('/html/head/base')
+                if len(basetaglist) != 0:
+                    baseuri = basetaglist[0].attrib['href']
+                if len(contentresult) != 0:
+                    toAbsoluteURIs(contentresult, baseuri)
+                if len(titleresult) != 0:
+                    toAbsoluteURIs(titleresult, baseuri)
+
+            if contentxpath != '' and titlexpath != '' and len(contentresult) != len(titleresult):
+                warning = 'WARNING: number of title blocks (' + str(len(titleresult)) + ') does not match number of content blocks (' + str(len(contentresult)) + ')'
+            elif contentxpath and len(contentresult) == 0:
+                warning = 'WARNING: content selector became invalid!'
+            elif titlexpath and len(titleresult) == 0:
+                warning = 'WARNING: title selector became invalid!'
+            else:
+                if len(contentresult) == 0:
+                    contentresult = titleresult
+                if len(titleresult) == 0:
+                    titleresult = contentresult
+
+            if isinstance(contentresult, str):
+                contents = [contentresult]
+            else:
+                contents = [etree.tostring(s, encoding=enc, pretty_print=True).decode(enc, errors='ignore') for s in contentresult]
+            if isinstance(titleresult, str):
+                titles = [getSubject(titleresult)]
+            else:
+                titles = [getSubject(etree.tostring(s, method='text', encoding=enc).decode(enc, errors='ignore')) for s in titleresult]
+
+    except IOError as e:
+        warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e)
+
+    if file is not None:
+        file.close()
+
+    if uri.startswith(cmdscheme) and process.wait() != 0:
+        warning = 'WARNING: process terminated with an error'
+
+    if warning:
+        return {'content': content, 'titles': titles, 'warning': warning}
+
+    # parse regex
+    if contentregex:
+        contents = [x for y in [re.findall(r'' + contentregex, c, re.S) for c in contents] for x in y]
+    if titleregex:
+        titles = [x for y in [re.findall(r'' + titleregex, c, re.S) for c in titles] for x in y]
+
+    if contentregex and titleregex and len(contents) != len(titles):
+        warning = 'WARNING: number of title blocks (' + str(len(titles)) + ') does not match number of content blocks (' + str(len(contents)) + ') after regex'
+    elif contentregex and len(contents) == 0:
+        warning = 'WARNING: content regex became invalid!'
+    elif titleregex and len(titles) == 0:
+        warning = 'WARNING: title regex became invalid!'
+    else:
+        if len(contents) == 0:
+            contents = titles
+        if len(titles) == 0:
+            titles = [getSubject(c) for c in contents]
+
+    return {'contents': contents, 'titles': titles, 'warning': warning}
 
 
 # returns a short subject line
 def getSubject(textContent):
-        if textContent == None or textContent == '':
-                return config.subjectPostfix
-        textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip()
-        return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent
+    if textContent == None or textContent == '':
+        return config.subjectPostfix
+    textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip()
+    return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent
 
 
 # generates a new RSS feed item
 def genFeedItem(subject, content, link, change):
-        feeditem = etree.Element('item')
-        titleitem = etree.Element('title')
-        titleitem.text = subject + ' #' + str(change)
-        feeditem.append(titleitem)
-        linkitem = etree.Element('link')
-        linkitem.text = link
-        feeditem.append(linkitem)
-        descriptionitem = etree.Element('description')
-        descriptionitem.text = content
-        feeditem.append(descriptionitem)
-        guiditem = etree.Element('guid')
-        guiditem.text = str(random.getrandbits(32))
-        feeditem.append(guiditem)
-        dateitem = etree.Element('pubDate')
-        dateitem.text = strftime("%a, %d %b %Y %H:%M:%S %Z", time.localtime())
-        feeditem.append(dateitem)
-
-        return feeditem
+    feeditem = etree.Element('item')
+    titleitem = etree.Element('title')
+    titleitem.text = subject + ' #' + str(change)
+    feeditem.append(titleitem)
+    linkitem = etree.Element('link')
+    linkitem.text = link
+    feeditem.append(linkitem)
+    descriptionitem = etree.Element('description')
+    descriptionitem.text = content
+    feeditem.append(descriptionitem)
+    guiditem = etree.Element('guid')
+    guiditem.text = str(random.getrandbits(32))
+    feeditem.append(guiditem)
+    dateitem = etree.Element('pubDate')
+    dateitem.text = strftime("%a, %d %b %Y %H:%M:%S %Z", time.localtime())
+    feeditem.append(dateitem)
+
+    return feeditem
 
 
 # sends mail notification
 def sendmail(receiver, subject, content, sendAsHtml, link):
-        global mailsession
-
-        if sendAsHtml:
-                baseurl = None
-                if link != None:
-                        content = '

' + subject + '

\n' + content - baseurl = urljoin(link, '/') - mail = MIMEText('' + subject + '' + ('' if baseurl else '') + '' + content + '', 'html', defaultEncoding) - else: - if link != None: - content = link + '\n\n' + content - mail = MIMEText(content, 'text', defaultEncoding) + global mailsession, defaultEncoding + + if sendAsHtml: + baseurl = None + if link != None: + content = '

' + subject + '

\n' + content + baseurl = urljoin(link, '/') + mail = MIMEText('' + subject + '' + ('' if baseurl else '') + '' + content + '', 'html', defaultEncoding) + else: + if link != None: + content = link + '\n\n' + content + mail = MIMEText(content, 'text', defaultEncoding) + + mail['From'] = config.sender + mail['To'] = receiver + mail['Subject'] = Header(subject, defaultEncoding) + + # initialize session once, not each time this method gets called + if mailsession is None: + mailsession = smtplib.SMTP(config.smtphost, config.smtpport) + if config.useTLS: + mailsession.ehlo() + mailsession.starttls() + if config.smtpusername is not None: + mailsession.login(config.smtpusername, config.smtppwd) + + mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) - mail['From'] = config.sender - mail['To'] = receiver - mail['Subject'] = Header(subject, defaultEncoding) - # initialize session once, not each time this method gets called - if mailsession is None: - mailsession = smtplib.SMTP(config.smtphost, config.smtpport) - if config.useTLS: - mailsession.ehlo() - mailsession.starttls() - if config.smtpusername is not None: - mailsession.login(config.smtpusername, config.smtppwd) - - mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) +# returns a list of all content that is stored locally for a specific site +def getStoredHashes(shortname): + result = [] + filename = shortname + ".txt" + if os.path.exists(filename): + with open(filename, 'r') as file: + for line in file: + result.append(line.rstrip()) -# returns a list of all content that is stored locally for a specific site -def getFileContents(shortname): - result = [] - for f in os.listdir('.'): - if f.startswith(shortname + '.') and f.endswith('.txt'): - file = open(f, 'rb') - result.append(file.read().decode('utf-8')) - file.close() - return result + return result # updates list of content that is stored locally for a specific site -def storeFileContents(shortname, contents): - for f 
in os.listdir('.'): - if f.startswith(shortname + '.') and f.endswith('.txt'): - os.remove(f) - - i = 0 - for c in contents: - file = open(shortname + '.' + str(i) + '.txt', 'wb') - file.write(c.encode('utf-8')) - file.close() - i += 1 +def storeHashes(shortname, contentHashes): + with open(shortname + '.txt', 'w') as file: + for h in contentHashes: + file.write(h + "\n") -def pollWebsites(): - # parse existing feed or create a new one - if config.enableRSSFeed: - if os.path.isfile(config.rssfile): - feedXML = etree.parse(config.rssfile) - else: - feedXML = etree.parse(io.StringIO(emptyfeed)) +def pollWebsites(): + global defaultEncoding - # start polling sites - sessionContents = [] - mailsSent = 0 - for site in config.sites: - if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession: - break + # parse existing feed or create a new one + if config.enableRSSFeed: + if os.path.isfile(config.rssfile): + feedXML = etree.parse(config.rssfile) + else: + feedXML = etree.parse(io.StringIO(emptyfeed)) + + # start polling sites + mailsSent = 0 + for site in config.sites: + print('polling site [' + site['shortname'] + '] ...') + sessionHashes = [] + parseResult = parseSite(site) + receiver = site.get('receiver', config.receiver) + + # if something went wrong, notify the user + if parseResult['warning']: + subject = '[' + site['shortname'] + '] WARNING' + print('WARNING: ' + parseResult['warning']) + if config.enableMailNotifications: + if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: + sendmail(receiver, subject, parseResult['warning'], False, None) + mailsSent = mailsSent + 1 + if config.enableRSSFeed: + feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0)) + else: + # otherwise, check which parts of the site were updated + changes = 0 + fileHashes = getStoredHashes(site['shortname']) + i = 0 + for content in parseResult['contents']: + + contenthash = 
hashlib.md5(content.encode(defaultEncoding)).hexdigest() + if contenthash not in fileHashes: + if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: + changes += 1 + sessionHashes.append(contenthash) + + subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i] + print(' ' + subject) + if config.enableMailNotifications and len(fileHashes) > 0: + sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri']) + mailsSent = mailsSent + 1 - print('polling site [' + site['shortname'] + '] ...') - parseResult = parseSite(site) - receiver = site.get('receiver', config.receiver) - - # if something went wrong, notify the user - if parseResult['warning']: - subject = '[' + site['shortname'] + '] WARNING' - print('WARNING: ' + parseResult['warning']) - if config.enableMailNotifications: - sendmail(receiver, subject, parseResult['warning'], False, None) - mailsSent = mailsSent + 1 if config.enableRSSFeed: - feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0)) + feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes)) else: - # otherwise, check which parts of the site were updated - changes = 0 - fileContents = getFileContents(site['shortname']) - i = 0 - for content in parseResult['contents']: - if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession: - break - - if content not in fileContents: - changes += 1 - sessionContents.append(content) - - subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i] - print(' ' + subject) - if config.enableMailNotifications and len(fileContents) > 0: - sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri']) - mailsSent = mailsSent + 1 - - if config.enableRSSFeed: - feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes)) - i += 1 - - - if changes > 0: - storeFileContents(site['shortname'], sessionContents) - 
print(' ' + str(changes) + ' updates') - - # store feed - if config.enableRSSFeed: - for o in feedXML.xpath('//channel/item[position() 0: + storeHashes(site['shortname'], sessionHashes) + print(' ' + str(changes) + ' updates') - try: - opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 'dry-run=']) - except getopt.GetoptError: - print('Usage: mwc.py --config=config --dry-run=shortname') - sys.exit(1) - for opt, arg in opts: - if opt == '-h': - print('Usage: mwc.py --config=config') - exit() - elif opt in ('-c', '--config'): - configMod = arg - elif opt in ('-d', '--dry-run'): - dryrun = arg - - config = importlib.import_module(configMod) - - if dryrun: - for site in config.sites: - if site['shortname'] == dryrun: - parseResult = parseSite(site) - print(parseResult) - print(str(len(parseResult['contents'])) + " results") - break - else: - try: - pollWebsites() - except: - msg = str(sys.exc_info()[0]) + '\n\n' + traceback.format_exc() - print(msg) - if config.receiver != '': - sendmail(config.receiver, '[mwc] Something went wrong ...', msg, False, None) - - if mailsession: - mailsession.quit() - mailsession = None + # store feed + if config.enableRSSFeed: + for o in feedXML.xpath('//channel/item[position() Date: Fri, 11 Aug 2017 05:41:14 +0200 Subject: New upstream release; refresh patches --- debian/changelog | 5 +- debian/patches/0100-config.diff | 81 +++++++++++++------------- debian/patches/0105-try_mail.diff | 68 +++++++++++----------- debian/patches/0110-syslog.diff | 118 ++++++++++++++------------------------ 4 files changed, 119 insertions(+), 153 deletions(-) diff --git a/debian/changelog b/debian/changelog index 18767a8..63449fb 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,6 +1,7 @@ -mwc (1.7.6-1) UNRELEASED; urgency=medium +mwc (1.8.2-1) UNRELEASED; urgency=medium - * New upstream release. + * New upstream release (Closes: #862004). + + Refresh patches. * Renumbering patches. 
* debian/patches/0105-try_mail.diff: - Replace undefined printf with print (Closes: #860494). diff --git a/debian/patches/0100-config.diff b/debian/patches/0100-config.diff index ce4dba7..8529874 100644 --- a/debian/patches/0100-config.diff +++ b/debian/patches/0100-config.diff @@ -5,47 +5,6 @@ Author: Jörg Frings-Fürst Last-Update: 2014-05-12 --- This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ -Index: trunk/mwc.py -=================================================================== ---- trunk.orig/mwc.py -+++ trunk/mwc.py -@@ -319,7 +319,7 @@ def pollWebsites(): - - if __name__ == "__main__": - -- configMod = 'config' -+ configMod = '/etc/mwc/mwc-config' - dryrun = None - - try: -@@ -335,9 +335,24 @@ if __name__ == "__main__": - configMod = arg - elif opt in ('-d', '--dry-run'): - dryrun = arg -- -- config = importlib.import_module(configMod) -- -+ # -+ # add code to load config from nonsystem path -+ # and change to datadir -+ # -+ try: -+ path = os.path.dirname(configMod) -+ fullname = os.path.basename(configMod) -+ sys.path.append(path) -+ config = importlib.import_module(fullname) -+ except: -+ print('Error: loading config') -+ sys.exit(2) -+ try: -+ os.chdir(config.datadir) -+ except: -+ print('Error: datadir not found') -+ sys.exit(3) -+ - if dryrun: - for site in config.sites: - if site['shortname'] == dryrun: Index: trunk/config_template.py =================================================================== --- trunk.orig/config_template.py @@ -56,7 +15,7 @@ Index: trunk/config_template.py # Copyright: (2013-2014) Michael Till Beck # License: GPL-2.0+ -@@ -46,11 +44,11 @@ sender = 'me@mymail.com' +@@ -47,11 +45,11 @@ sender = 'me@mymail.com' smtphost = 'mysmtpprovider.com' useTLS = True smtpport = 587 @@ -113,3 +72,41 @@ Index: trunk/mwcfeedserver.py +except KeyboardInterrupt: + pass +httpd.server_close() +Index: trunk/mwc.py +=================================================================== +--- trunk.orig/mwc.py ++++ trunk/mwc.py +@@ 
-334,7 +334,7 @@ def pollWebsites(): + + if __name__ == "__main__": + +- configMod = 'config' ++ configMod = '/etc/mwc/mwc-config' + dryrun = None + + try: +@@ -351,7 +351,23 @@ if __name__ == "__main__": + elif opt in ('-d', '--dry-run'): + dryrun = arg + +- config = importlib.import_module(configMod) ++ # ++ # add code to load config from nonsystem path ++ # and change to datadir ++ # ++ try: ++ path = os.path.dirname(configMod) ++ fullname = os.path.basename(configMod) ++ sys.path.append(path) ++ config = importlib.import_module(fullname) ++ except: ++ print('Error: loading config') ++ sys.exit(2) ++ try: ++ os.chdir(config.datadir) ++ except: ++ print('Error: datadir not found') ++ sys.exit(3) + + if dryrun: + for site in config.sites: diff --git a/debian/patches/0105-try_mail.diff b/debian/patches/0105-try_mail.diff index d390b6d..bc62ef1 100644 --- a/debian/patches/0105-try_mail.diff +++ b/debian/patches/0105-try_mail.diff @@ -12,41 +12,41 @@ Index: trunk/mwc.py =================================================================== --- trunk.orig/mwc.py +++ trunk/mwc.py -@@ -225,16 +225,27 @@ def sendmail(receiver, subject, content, - mail['Subject'] = Header(subject, defaultEncoding) +@@ -232,16 +232,28 @@ def sendmail(receiver, subject, content, + mail['Subject'] = Header(subject, defaultEncoding) - # initialize session once, not each time this method gets called -- if mailsession is None: -- mailsession = smtplib.SMTP(config.smtphost, config.smtpport) -- if config.useTLS: -- mailsession.ehlo() -- mailsession.starttls() -- if config.smtpusername is not None: -- mailsession.login(config.smtpusername, config.smtppwd) + # initialize session once, not each time this method gets called +- if mailsession is None: +- mailsession = smtplib.SMTP(config.smtphost, config.smtpport) +- if config.useTLS: +- mailsession.ehlo() +- mailsession.starttls() +- if config.smtpusername is not None: +- mailsession.login(config.smtpusername, config.smtppwd) - -- 
mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) -- -+ # -+ # add try / except to open mailsession -+ # -+ try: -+ if mailsession is None: -+ mailsession = smtplib.SMTP(config.smtphost, config.smtpport) -+ if config.useTLS: -+ mailsession.ehlo() -+ mailsession.starttls() -+ mailsession.login(config.smtpusername, config.smtppwd) -+ # -+ # add try / except to send mail -+ # -+ except: -+ print('Error: Open smtp-session') -+ exit(4) -+ try: -+ mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) -+ except: -+ print('Error: sendmail') -+ exit(5) +- mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) + ++ # ++ # add try / except to open mailsession ++ # ++ try: ++ if mailsession is None: ++ mailsession = smtplib.SMTP(config.smtphost, config.smtpport) ++ if config.useTLS: ++ mailsession.ehlo() ++ mailsession.starttls() ++ mailsession.login(config.smtpusername, config.smtppwd) ++ # ++ # add try / except to send mail ++ # ++ except: ++ print('Error: Open smtp-session') ++ exit(4) ++ try: ++ mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) ++ except: ++ print('Error: sendmail') ++ exit(5) # returns a list of all content that is stored locally for a specific site - def getFileContents(shortname): + def getStoredHashes(shortname): diff --git a/debian/patches/0110-syslog.diff b/debian/patches/0110-syslog.diff index 12d629d..bd61d81 100644 --- a/debian/patches/0110-syslog.diff +++ b/debian/patches/0110-syslog.diff @@ -8,7 +8,7 @@ Index: trunk/mwc.py =================================================================== --- trunk.orig/mwc.py +++ trunk/mwc.py -@@ -19,6 +19,7 @@ import os +@@ -21,6 +21,7 @@ import os import sys import getopt import traceback @@ -16,81 +16,49 @@ Index: trunk/mwc.py import subprocess -@@ -227,25 +228,28 @@ def sendmail(receiver, subject, content, - # initialize session once, not each time this method gets called - # - # add try / except to open mailsession 
-- # -+ # -+ - try: -- if mailsession is None: -- mailsession = smtplib.SMTP(config.smtphost, config.smtpport) -- if config.useTLS: -- mailsession.ehlo() -- mailsession.starttls() -- mailsession.login(config.smtpusername, config.smtppwd) -- # -+ if mailsession is None: -+ mailsession = smtplib.SMTP(config.smtphost, config.smtpport) -+ if config.useTLS: -+ mailsession.ehlo() -+ mailsession.starttls() -+ mailsession.login(config.smtpusername, config.smtppwd) -+ except: -+ print('Error: Open smtp-session') -+ syslog.syslog(syslog.LOG_ERR, 'can not open smtp session') -+ exit(4) -+ # - # add try / except to send mail - # -+ try: -+ mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) - except: -- print('Error: Open smtp-session') -- exit(4) -- try: -- mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) -- except: -- print('Error: sendmail') -- exit(5) -+ print('Error: sendmail') -+ syslog.syslog(syslog.LOG_ERR, 'error on sendmail') -+ exit(5) +@@ -248,11 +249,13 @@ def sendmail(receiver, subject, content, + # + except: + print('Error: Open smtp-session') ++ syslog.syslog(syslog.LOG_ERR, 'can not open smtp session') + exit(4) + try: + mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) + except: + print('Error: sendmail') ++ syslog.syslog(syslog.LOG_ERR, 'error on sendmail') + exit(5) # returns a list of all content that is stored locally for a specific site - def getFileContents(shortname): -@@ -332,7 +336,11 @@ if __name__ == "__main__": +@@ -349,6 +352,11 @@ if __name__ == "__main__": + configMod = '/etc/mwc/mwc-config' + dryrun = None - configMod = '/etc/mwc/mwc-config' - dryrun = None -- -+ -+ # -+ # add syslog open -+ # -+ syslog.openlog() - try: - opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 'dry-run=']) - except getopt.GetoptError: -@@ -357,11 +365,13 @@ if __name__ == "__main__": - config = importlib.import_module(fullname) - except: - print('Error: loading config') -+ 
syslog.syslog(syslog.LOG_ERR, 'can not found / load mwc-config') - sys.exit(2) - try: - os.chdir(config.datadir) - except: - print('Error: datadir not found') -+ syslog.syslog(syslog.LOG_ERR, 'datadir not found') - sys.exit(3) - - if dryrun: -@@ -383,3 +393,5 @@ if __name__ == "__main__": - mailsession.quit() - mailsession = None ++ # ++ # add syslog open ++ # ++ syslog.openlog() ++ + try: + opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 'dry-run=']) + except getopt.GetoptError: +@@ -374,11 +382,13 @@ if __name__ == "__main__": + config = importlib.import_module(fullname) + except: + print('Error: loading config') ++ syslog.syslog(syslog.LOG_ERR, 'can not found / load mwc-config') + sys.exit(2) + try: + os.chdir(config.datadir) + except: + print('Error: datadir not found') ++ syslog.syslog(syslog.LOG_ERR, 'datadir not found') + sys.exit(3) -+ syslog.closelog() -+ -\ No newline at end of file + if dryrun: +@@ -400,3 +410,5 @@ if __name__ == "__main__": + if mailsession: + mailsession.quit() + mailsession = None ++ ++ syslog.closelog() -- cgit v1.2.3 From 04b13e003d6af0de21e6c59e411ffee5b97b6134 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 1 Oct 2017 18:50:17 +0200 Subject: New upstream version 2.0.4 --- README.md | 89 ++++++++++------ config_template.py | 74 ++++++------- mwc.py | 298 +++++++++++++++-------------------------------------- mwcfeedserver.py | 31 +++--- mwctools.py | 239 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 433 insertions(+), 298 deletions(-) mode change 100644 => 100755 config_template.py create mode 100755 mwctools.py diff --git a/README.md b/README.md index 8e78da6..69718e7 100644 --- a/README.md +++ b/README.md @@ -19,20 +19,23 @@ Some examples: sites = [ - {'shortname': 'mywebsite1', - 'uri': 'http://www.mywebsite1.com/info', - 'contentcss': 'div'}, - - {'shortname': 'mywebsite2', - 'uri': 'http://www.mywebsite2.com/info', - 'contentxpath': '//*[contains(concat(\' 
\', normalize-space(@class), \' \'), \' news-list-container \')]', - 'titlexpath': '//title'}, - - {'shortname': 'mywebsite3', - 'uri': 'http://www.mywebsite3.com/info', - 'type': 'text', - 'contentregex': 'Version\"\:\d*\.\d*', - 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'} + {'name': 'example-css', + 'parsers': [uri(uri='https://github.com/mtill', contenttype='html'), + css(contentcss='div') + ] + }, + + {'name': 'example-xpath', + 'parsers': [uri(uri='https://example-webpage.com/test', contenttype='html'), + xpath(contentxpath='//div[contains(concat(\' \', normalize-space(@class), \' \'), \' package-version-header \')]') + ] + }, + + {'name': 'my-script', + 'parsers': [command(command='/home/user/script.sh', contenttype='text'), + regex(contentregex='^.*$') + ] + } ] @@ -40,31 +43,55 @@ sites = [ * parameters: - * shortname - short name of the entry, used as an identifier when sending email notifications + * name + name of the entry, used as an identifier when sending email notifications + * receiver (optional) + Overrides global receiver specification. + + * parameters for the URL receiver: + * uri - URI of the website; If the scheme of the uri is 'cmd://', the string is interpreted as a command and the standard output (stdout) is parsed. - * type (optional; default: 'html') + URI of the website + * contenttype (optional; default: 'html') content type, e.g., 'xml'/'html'/'text'. - * contentxpath / titlexpath (optional) - XPath expression for the content/title sections to extract. If you prefer, you could use contentcss/titlecss instead. - * contentcss / titlecss (optional) - CSS expression for the content/title sections to extract. This is ignored if there is a corresponding XPath definition. - * contentregex / titleregex (optional) - Regular expression. If XPath/CSS selector is defined, the regular expression is applied afterwards. 
- * encoding (optional; default: 'utf-8') + * enc (optional; default: 'utf-8') Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'. - * splitregex (optional) - only works if type is set to 'text'; defines that content should be split to chunks based on the defined regex expression. - * receiver (optional) - Overrides global receiver specification. - * user-agent (optional) + * userAgent (optional) Defines the user agent string, e.g., - 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0' + 'userAgent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0' * accept (optional) Defines the accept string, e.g., 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' + * parameters for the Command receiver + + * command + the command + * contenttype (optional; default: 'text') + content type, e.g., 'xml'/'html'/'text'. + * enc (optional; default: 'utf-8') + Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'. + + * parameters for the XPath parser: + + * contentxpath + XPath expression for the content sections to extract + * titlexpath (optional) + XPath expression for the title sections to extract + + * parameters for the CSS parser: + + * contentcss + CSS expression for the content sections to extract + * titlecss (optional) + CSS expression for the title sections to extract + + * parameters for the RegEx parser: + + * contentregex + Regular expression for content parsing + * titleregex (optional) + Regular expression for title parsing * We collect some XPath/CSS snippets at this place: Snippet collection - please feel free to add your own definitions! 
diff --git a/config_template.py b/config_template.py old mode 100644 new mode 100755 index f394e52..02788bd --- a/config_template.py +++ b/config_template.py @@ -1,47 +1,49 @@ -import os.path +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- -# Copyright: (2013-2014) Michael Till Beck +# Copyright: (2013-2017) Michael Till Beck # License: GPL-2.0+ -#We collect xpath snippets at this place: Snippet collection - please feel free to add your own definitions! + +# We collect xpath snippets at this place: +# Snippet collection +# Feel free to contribute! + + +from mwctools import URLReceiver as uri +from mwctools import CommandReceiver as command +from mwctools import XPathParser as xpath +from mwctools import CSSParser as css +from mwctools import RegExParser as regex +from mwctools import Content +from mwctools import Parser + sites = [ - {'shortname': 'mywebsite1', - 'uri': 'http://www.mywebsite1.com/info', - 'type': 'html', - 'titlexpath': '//h1', - 'contentxpath': '//div', - 'titleregex': '', - 'contentregex': '', - 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0', - 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' - 'encoding': 'utf-8'}, - - {'shortname': 'mywebsite2', - 'uri': 'http://www.mywebsite2.com/info', - 'type': 'html', - 'contentxpath': '//*[contains(concat(\' \', normalize-space(@class), \' \'), \' news-list-container \')]', - 'regex': '', - 'encoding': 'utf-8'}, - - {'shortname': 'mywebsite3', - 'uri': 'http://www.mywebsite3.com/info', - 'type': 'text', - 'contentxpath': '', - 'contentregex': 'Version\"\:\d*\.\d*', - 'encoding': 'utf-8'}, - - {'shortname': 'lscmd', - 'uri': 'cmd://ls -l /home/pi', - 'contentregex': '.*Desktop.*' - } + {'name': 'example-css', + 'parsers': [uri(uri='https://github.com/mtill', contenttype='html'), + css(contentcss='div') + ] + }, + + {'name': 'example-xpath', + 'parsers': [uri(uri='https://example-webpage.com/test', contenttype='html'), + 
xpath(contentxpath='//div[contains(concat(\' \', normalize-space(@class), \' \'), \' package-version-header \')]') + ] + }, + + {'name': 'my-script', + 'parsers': [command(command='/home/user/script.sh', contenttype='text'), + regex(contentregex='^.*$') + ] + } ] -subjectPostfix = 'A website has been updated!' +workingDirectory = '/path-to-data-dir/MailWebsiteChanges-data' -enableMailNotifications = True +enableMailNotifications = False maxMailsPerSession = -1 sender = 'me@mymail.com' smtphost = 'mysmtpprovider.com' @@ -51,9 +53,7 @@ smtpusername = sender smtppwd = 'mypassword' receiver = 'me2@mymail.com' -os.chdir('/var/cache/mwc') - -enableRSSFeed = True +enableRSSFeed = False rssfile = 'feed.xml' maxFeeds = 100 diff --git a/mwc.py b/mwc.py index 4df4799..6a48317 100755 --- a/mwc.py +++ b/mwc.py @@ -1,15 +1,11 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright: (2013-2015) Michael Till Beck +# Copyright: (2013-2017) Michael Till Beck # License: GPL-2.0+ -import urllib.request, urllib.error, urllib.parse -import urllib.parse -from lxml import etree -from cssselect import GenericTranslator -import re import io +from lxml import etree import hashlib import smtplib @@ -22,8 +18,6 @@ import sys import getopt import traceback -import subprocess - import time from time import strftime import random @@ -32,164 +26,20 @@ import importlib config = None defaultEncoding = 'utf-8' -maxTitleLength = 150 # this is how an empty RSS feed looks like emptyfeed = """ MailWebsiteChanges Feed - https://github.com/Debianguru/MailWebsiteChanges + https://github.com/mtill/MailWebsiteChanges MailWebsiteChanges Feed """ -# Attributes in HTML files storing URI values. These values are automatically translated to absolute URIs. 
-uriAttributes = [['//img[@src]', 'src'], ['//a[@href]', 'href']] -cmdscheme = 'cmd://' - mailsession = None -# translates all relative URIs found in trees to absolute URIs -def toAbsoluteURIs(trees, baseuri): - for tree in trees: - if isinstance(tree, str): - continue - for uriAttribute in uriAttributes: - tags = tree.xpath(uriAttribute[0]) - for tag in tags: - if tag.attrib.get(uriAttribute[1]) != None: - if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '': - tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]]) - - -def parseSite(site): - global defaultEncoding - file, content, titles, warning = None, None, None, None - - uri = site['uri'] - contenttype = site.get('type', 'html') - contentregex = site.get('contentregex', '') - titleregex = site.get('titleregex', '') - splitregex = site.get('splitregex', '') - enc = site.get('encoding', defaultEncoding) - - contentxpath = site.get('contentxpath', '') - if contentxpath == '' and site.get('contentcss', '') != '': - # CSS - contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss')) - titlexpath = site.get('titlexpath', '') - if titlexpath == '' and site.get('titlecss', '') != '': - titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss')) - - try: - - if uri.startswith(cmdscheme): - # run command and retrieve output - process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True) - file = process.stdout - else: - # open website - req = urllib.request.Request(uri) - if 'user-agent' in site: - req.add_header('User-Agent', site['user-agent']) - if 'accept' in site: - req.add_header('Accept', site['accept']) - file = urllib.request.urlopen(req) - - - if contenttype == 'text' or (contentxpath == '' and titlexpath == ''): - thefullcontent = file.read().decode(enc, errors='ignore') - contents = [thefullcontent] - if splitregex != '': - contents = thefullcontent.split(splitregex) - titles = [] - else: - baseuri = 
uri - if contenttype == 'html': - parser = etree.HTMLParser(encoding=enc) - else: - parser = etree.XMLParser(recover=True, encoding=enc) - - tree = etree.parse(file, parser) - - # xpath - contentresult = tree.xpath(contentxpath) if contentxpath else [] - titleresult = tree.xpath(titlexpath) if titlexpath else [] - - # translate relative URIs to absolute URIs - if contenttype == 'html': - basetaglist = tree.xpath('/html/head/base') - if len(basetaglist) != 0: - baseuri = basetaglist[0].attrib['href'] - if len(contentresult) != 0: - toAbsoluteURIs(contentresult, baseuri) - if len(titleresult) != 0: - toAbsoluteURIs(titleresult, baseuri) - - if contentxpath != '' and titlexpath != '' and len(contentresult) != len(titleresult): - warning = 'WARNING: number of title blocks (' + str(len(titleresult)) + ') does not match number of content blocks (' + str(len(contentresult)) + ')' - elif contentxpath and len(contentresult) == 0: - warning = 'WARNING: content selector became invalid!' - elif titlexpath and len(titleresult) == 0: - warning = 'WARNING: title selector became invalid!' 
- else: - if len(contentresult) == 0: - contentresult = titleresult - if len(titleresult) == 0: - titleresult = contentresult - - if isinstance(contentresult, str): - contents = [contentresult] - else: - contents = [etree.tostring(s, encoding=enc, pretty_print=True).decode(enc, errors='ignore') for s in contentresult] - if isinstance(titleresult, str): - titles = [getSubject(titleresult)] - else: - titles = [getSubject(etree.tostring(s, method='text', encoding=enc).decode(enc, errors='ignore')) for s in titleresult] - - except IOError as e: - warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e) - - if file is not None: - file.close() - - if uri.startswith(cmdscheme) and process.wait() != 0: - warning = 'WARNING: process terminated with an error' - - if warning: - return {'content': content, 'titles': titles, 'warning': warning} - - # parse regex - if contentregex: - contents = [x for y in [re.findall(r'' + contentregex, c, re.S) for c in contents] for x in y] - if titleregex: - titles = [x for y in [re.findall(r'' + titleregex, c, re.S) for c in titles] for x in y] - - if contentregex and titleregex and len(contents) != len(titles): - warning = 'WARNING: number of title blocks (' + str(len(titles)) + ') does not match number of content blocks (' + str(len(contents)) + ') after regex' - elif contentregex and len(contents) == 0: - warning = 'WARNING: content regex became invalid!' - elif titleregex and len(titles) == 0: - warning = 'WARNING: title regex became invalid!' 
- else: - if len(contents) == 0: - contents = titles - if len(titles) == 0: - titles = [getSubject(c) for c in contents] - - return {'contents': contents, 'titles': titles, 'warning': warning} - - -# returns a short subject line -def getSubject(textContent): - if textContent == None or textContent == '': - return config.subjectPostfix - textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip() - return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent - - # generates a new RSS feed item def genFeedItem(subject, content, link, change): feeditem = etree.Element('item') @@ -213,23 +63,26 @@ def genFeedItem(subject, content, link, change): # sends mail notification -def sendmail(receiver, subject, content, sendAsHtml, link): +def sendmail(receiver, subject, content, sendAsHtml, link, encoding=None): global mailsession, defaultEncoding + if encoding is None: + encoding = defaultEncoding + if sendAsHtml: baseurl = None - if link != None: + if link is not None: content = '

' + subject + '

\n' + content baseurl = urljoin(link, '/') - mail = MIMEText('' + subject + '' + ('' if baseurl else '') + '' + content + '', 'html', defaultEncoding) + mail = MIMEText('' + subject + '' + ('' if baseurl else '') + '' + content + '', 'html', encoding) else: - if link != None: + if link is not None: content = link + '\n\n' + content - mail = MIMEText(content, 'text', defaultEncoding) + mail = MIMEText(content, 'text', encoding) mail['From'] = config.sender mail['To'] = receiver - mail['Subject'] = Header(subject, defaultEncoding) + mail['Subject'] = Header(subject, encoding) # initialize session once, not each time this method gets called if mailsession is None: @@ -244,103 +97,114 @@ def sendmail(receiver, subject, content, sendAsHtml, link): # returns a list of all content that is stored locally for a specific site -def getStoredHashes(shortname): - +def getStoredHashes(name): result = [] - filename = shortname + ".txt" + filename = os.path.join(config.workingDirectory, name + ".txt") if os.path.exists(filename): - with open(filename, 'r') as file: - for line in file: + with open(filename, 'r') as thefile: + for line in thefile: result.append(line.rstrip()) return result # updates list of content that is stored locally for a specific site -def storeHashes(shortname, contentHashes): - - with open(shortname + '.txt', 'w') as file: +def storeHashes(name, contentHashes): + with open(os.path.join(config.workingDirectory, name + '.txt'), 'w') as thefile: for h in contentHashes: - file.write(h + "\n") + thefile.write(h + "\n") + + +def runParsers(parsers, contentList=None): + if contentList is None: + contentList = [] + + for parser in parsers: + contentList = parser.performAction(contentList) + + return contentList def pollWebsites(): global defaultEncoding # parse existing feed or create a new one + rssfile = config.rssfile + if not os.path.isabs(rssfile): + rssfile = os.path.join(config.workingDirectory, rssfile) + if config.enableRSSFeed: - if 
os.path.isfile(config.rssfile): - feedXML = etree.parse(config.rssfile) + if os.path.isfile(rssfile): + feedXML = etree.parse(rssfile) else: feedXML = etree.parse(io.StringIO(emptyfeed)) # start polling sites mailsSent = 0 for site in config.sites: - print('polling site [' + site['shortname'] + '] ...') - sessionHashes = [] - parseResult = parseSite(site) + print('polling site [' + site['name'] + '] ...') receiver = site.get('receiver', config.receiver) - # if something went wrong, notify the user - if parseResult['warning']: - subject = '[' + site['shortname'] + '] WARNING' - print('WARNING: ' + parseResult['warning']) + try: + contentList = runParsers(site['parsers']) + except Exception as e: + # if something went wrong, notify the user + subject = '[' + site['name'] + '] WARNING' + print('WARNING: ' + str(e)) if config.enableMailNotifications: if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: - sendmail(receiver, subject, parseResult['warning'], False, None) + sendmail(receiver=receiver, subject=subject, content=str(e), sendAsHtml=False, link=None) mailsSent = mailsSent + 1 if config.enableRSSFeed: - feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0)) - else: - # otherwise, check which parts of the site were updated - changes = 0 - fileHashes = getStoredHashes(site['shortname']) - i = 0 - for content in parseResult['contents']: - - contenthash = hashlib.md5(content.encode(defaultEncoding)).hexdigest() - if contenthash not in fileHashes: - if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: - changes += 1 - sessionHashes.append(contenthash) - - subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i] - print(' ' + subject) - if config.enableMailNotifications and len(fileHashes) > 0: - sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri']) - mailsSent = mailsSent + 1 - - if config.enableRSSFeed: - 
feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes)) - else: + feedXML.xpath('//channel')[0].append(genFeedItem(subject, str(e), "", 0)) + continue + + sessionHashes = [] + changedContents = [] + fileHashes = getStoredHashes(site['name']) + for content in contentList: + + contenthash = hashlib.md5(content.content.encode(content.encoding)).hexdigest() + if contenthash not in fileHashes: + if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: sessionHashes.append(contenthash) + changedContents.append(content) - i += 1 + subject = '[' + site['name'] + '] ' + content.title + print(' ' + subject) + if config.enableMailNotifications and len(fileHashes) > 0: + sendAsHtml = (content.contenttype == 'html') + sendmail(receiver=receiver, subject=subject, content=content.content, sendAsHtml=sendAsHtml, link=content.uri, encoding=content.encoding) + mailsSent = mailsSent + 1 + if config.enableRSSFeed: + feedXML.xpath('//channel')[0].append(genFeedItem(subject, content.content, content.uri, len(changedContents))) + else: + sessionHashes.append(contenthash) + + if 'postRun' in site: + runParsers(site['postRun'], changedContents) - if changes > 0: - storeHashes(site['shortname'], sessionHashes) - print(' ' + str(changes) + ' updates') + if len(changedContents) > 0: + storeHashes(site['name'], sessionHashes) + print(' ' + str(len(changedContents)) + ' updates') # store feed if config.enableRSSFeed: for o in feedXML.xpath('//channel/item[position() +# Copyright: (2013-2017) Michael Till Beck # License: GPL-2.0+ + import http.server import socketserver import importlib import sys import getopt + bind = 'localhost' port = 8000 configMod = 'config' try: - opts, args = getopt.getopt(sys.argv[1:], 'hc:b:p:', ['help', 'config=', 'bind=', 'port=']) + opts, args = getopt.getopt(sys.argv[1:], 'hc:b:p:', ['help', 'config=', 'bind=', 'port=']) except getopt.GetoptError: - print('Usage: FeedServer.py --config=config --port=8000') - 
sys.exit(1) + print('Usage: FeedServer.py --config=config --port=8000 --bind=localhost') + sys.exit(1) for opt, arg in opts: - if opt == '-h': - print('Usage: FeedServer.py --config=config --bind=localhost --port=8000') - exit() - elif opt in ('-c', '--config'): - configMod = arg - elif opt in ('-b', '--bind'): - bind = arg - elif opt in ('-p', '--port'): - port = int(arg) + if opt == '-h': + print('Usage: FeedServer.py --config=config --bind=localhost --port=8000') + exit() + elif opt in ('-c', '--config'): + configMod = arg + elif opt in ('-b', '--bind'): + bind = arg + elif opt in ('-p', '--port'): + port = int(arg) config = importlib.import_module(configMod) diff --git a/mwctools.py b/mwctools.py new file mode 100755 index 0000000..cefbbf0 --- /dev/null +++ b/mwctools.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright: (2013-2017) Michael Till Beck +# License: GPL-2.0+ + + +import urllib.request +import urllib.error +import urllib.parse +import subprocess + +from lxml import etree +from cssselect import GenericTranslator +import re + + +# Attributes in HTML files storing URI values. These values are automatically translated to absolute URIs. 
+uriAttributes = [['//img[@src]', 'src'], ['//a[@href]', 'href']] + +maxTitleLength = 150 + + +class Parser: + # input: [Content], output: [Content] + def performAction(self, contentList): + pass + + +class Receiver(Parser): + def __init__(self, uri): + self.uri = uri + + +class Content: + def __init__(self, uri, encoding, title, content, contenttype): + self.uri = uri + self.encoding = encoding + self.title = title + self.content = content + self.contenttype = contenttype + + +# returns a short subject line +def getSubject(textContent): + global maxTitleLength + + if textContent is None or len(textContent.strip()) == 0: + return 'Website has been updated' + textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip() + return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent + + +# translates all relative URIs found in trees to absolute URIs +def toAbsoluteURIs(trees, baseuri): + global uriAttributes + + for tree in trees: + if isinstance(tree, str): + continue + for uriAttribute in uriAttributes: + tags = tree.xpath(uriAttribute[0]) + for tag in tags: + if tag.attrib.get(uriAttribute[1]) is not None: + if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '': + tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]]) + + +class URLReceiver(Receiver): + def __init__(self, uri, contenttype='html', encoding='utf-8', userAgent=None, accept=None): + super().__init__(uri) + self.contenttype = contenttype + self.encoding = encoding + self.userAgent = userAgent + self.accept = accept + + # input: [Content], output: [Content] + def performAction(self, contentList=None): + if contentList is None: + contentList = [] + + # open website + req = urllib.request.Request(self.uri) + if self.userAgent is not None: + req.add_header('User-Agent', self.userAgent) + if self.accept is not None: + req.add_header('Accept', self.accept) + + with urllib.request.urlopen(req) as thefile: + 
filecontent = thefile.read().decode(self.encoding, errors='ignore') + contentList.append(Content(uri=self.uri, encoding=self.encoding, title=None, content=filecontent, contenttype=self.contenttype)) + + return contentList + + +class CommandReceiver(Receiver): + def __init__(self, command, contenttype='text', encoding='utf-8'): + super().__init__(command) + self.encoding = encoding + self.command = command + self.contenttype = contenttype + + # input: [Content], output: [Content] + def performAction(self, contentList=None): + if contentList is None: + contentList = [] + + # run command and retrieve output + process = subprocess.Popen(self.command, stdout=subprocess.PIPE, shell=True, close_fds=True) + thefile = process.stdout + result = thefile.read().decode(self.encoding, errors='ignore') + thefile.close() + + if process.wait() != 0: + raise Exception("process terminated with an error") + + contentList.append(Content(uri=None, encoding=self.encoding, title=None, content=result, contenttype=self.contenttype)) + return contentList + + +class XPathParser(Parser): + def __init__(self, contentxpath, titlexpath=None): + self.contentxpath = contentxpath + self.titlexpath = titlexpath + + # input: [Content], output: [Content] + def performAction(self, contentList): + result = [] + for content in contentList: + result.extend(self.parseOneObject(content)) + return result + + # input: Content, output: [Content] + def parseOneObject(self, content): + baseuri = content.uri + if content.contenttype == 'html': + parser = etree.HTMLParser(encoding=content.encoding) + else: + parser = etree.XMLParser(recover=True, encoding=content.encoding) + + tree = etree.fromstring(content.content, parser=parser) + + # xpath + contentresult = [] if self.contentxpath is None else tree.xpath(self.contentxpath) + titleresult = [] if self.titlexpath is None else tree.xpath(self.titlexpath) + + # translate relative URIs to absolute URIs + if content.contenttype == 'html': + basetaglist = 
tree.xpath('/html/head/base') + if len(basetaglist) != 0: + baseuri = basetaglist[0].attrib['href'] + if len(contentresult) != 0: + toAbsoluteURIs(contentresult, baseuri) + if len(titleresult) != 0: + toAbsoluteURIs(titleresult, baseuri) + + if self.contentxpath and len(contentresult) == 0: + raise Exception('WARNING: content selector became invalid!') + if self.titlexpath and len(titleresult) == 0: + raise Exception('WARNING: title selector became invalid!') + + contents = [] + titles = [] + if isinstance(contentresult, str): + contents = [contentresult] + else: + if len(contentresult) == 0: + contentresult = titleresult + contents = [etree.tostring(s, encoding=content.encoding, pretty_print=True).decode(content.encoding, errors='ignore') for s in contentresult] + + if isinstance(titleresult, str): + titles = [getSubject(titleresult)]*len(contents) + else: + if len(titleresult) == 0 or len(titleresult) != len(contentresult): + titleresult = contentresult + titles = [getSubject(etree.tostring(s, method='text', encoding=content.encoding).decode(content.encoding, errors='ignore')) for s in titleresult] + + result = [] + for i in range(0, len(contents)): + result.append(Content(uri=content.uri, encoding=content.encoding, title=titles[i], content=contents[i], contenttype=content.contenttype)) + + return result + + +class CSSParser(Parser): + def __init__(self, contentcss, titlecss=None): + contentxpath = GenericTranslator().css_to_xpath(contentcss) + titlexpath = None + if titlecss is not None: + titlexpath = GenericTranslator().css_to_xpath(titlecss) + + self.xpathparser = XPathParser(contentxpath=contentxpath, titlexpath=titlexpath) + + # input: [Content], output: [Content] + def performAction(self, contentList): + return self.xpathparser.performAction(contentList) + + +class RegExParser(Parser): + def __init__(self, contentregex, titleregex=None): + self.contentregex = contentregex + self.titleregex = titleregex + + # input: [Content], output: [Content] + def 
performAction(self, contentList): + result = [] + for content in contentList: + result.extend(self.parseOneObject(content)) + return result + + # input: Content, output: [Content] + def parseOneObject(self, content): + contents = [] + titles = [] + if self.contentregex is not None: + for c in re.findall(r'' + self.contentregex, content.content, re.M): + if len(c.strip()) != 0: + contents.append(c) + if self.titleregex is not None: + for c in re.findall(r'' + self.titleregex, content.title, re.M): + if len(c.strip()) != 0: + titles.append(c) + + if self.contentregex is not None and len(contents) == 0: + raise Exception('WARNING: content regex became invalid!') + elif self.titleregex is not None and len(titles) == 0: + raise Exception('WARNING: title regex became invalid!') + else: + if len(contents) == 0: + contents = titles + if len(titles) == 0 or len(titles) != len(contents): + titles = [getSubject(c) for c in contents] + + result = [] + for i in range(0, len(contents)): + result.append(Content(uri=content.uri, encoding=content.encoding, title=titles[i], content=contents[i], contenttype=content.contenttype)) + + return result + -- cgit v1.2.3 From 85ce8edb9d5b92df39de1a22002bf29d779eeac2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 1 Oct 2017 19:25:58 +0200 Subject: Remove patches --- debian/changelog | 7 ++----- debian/patches/0100-config.diff | 24 ------------------------ debian/patches/series | 6 +++--- 3 files changed, 5 insertions(+), 32 deletions(-) diff --git a/debian/changelog b/debian/changelog index 63449fb..8335f2d 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,10 +1,7 @@ -mwc (1.8.2-1) UNRELEASED; urgency=medium +mwc (2.0.4-1) UNRELEASED; urgency=medium * New upstream release (Closes: #862004). - + Refresh patches. - * Renumbering patches. - * debian/patches/0105-try_mail.diff: - - Replace undefined printf with print (Closes: #860494). + + Remove now unuseable patches. 
* Rewrite debian/watch for archives without "v" in front of the version. * Declare compliance with Debian Policy 4.0.0. (No changes needed). * Bump compatlevel to 10 (no changes required): diff --git a/debian/patches/0100-config.diff b/debian/patches/0100-config.diff index 8529874..49bb589 100644 --- a/debian/patches/0100-config.diff +++ b/debian/patches/0100-config.diff @@ -5,30 +5,6 @@ Author: Jörg Frings-Fürst Last-Update: 2014-05-12 --- This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ -Index: trunk/config_template.py -=================================================================== ---- trunk.orig/config_template.py -+++ trunk/config_template.py -@@ -1,5 +1,3 @@ --import os.path -- - # Copyright: (2013-2014) Michael Till Beck - # License: GPL-2.0+ - -@@ -47,11 +45,11 @@ sender = 'me@mymail.com' - smtphost = 'mysmtpprovider.com' - useTLS = True - smtpport = 587 --smtpusername = sender -+smtpusername = 'sender' - smtppwd = 'mypassword' - receiver = 'me2@mymail.com' - --os.chdir('/var/cache/mwc') -+datadir'/var/cache/mwc' - - enableRSSFeed = True - rssfile = 'feed.xml' Index: trunk/mwcfeedserver.py =================================================================== --- trunk.orig/mwcfeedserver.py diff --git a/debian/patches/series b/debian/patches/series index a06ba62..3ad4804 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -1,4 +1,4 @@ -0100-config.diff -0105-try_mail.diff -0110-syslog.diff +#0100-config.diff +#0105-try_mail.diff +#0110-syslog.diff #0115-Add_header.patch -- cgit v1.2.3 From 9a99fe51fb073a85169d4e50ca68cb0ec4531c8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 1 Oct 2017 19:26:45 +0200 Subject: Declare compliance with Debian Policy 4.1.1 --- debian/changelog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian/changelog b/debian/changelog index 8335f2d..13fc5f0 100644 --- a/debian/changelog +++ b/debian/changelog @@ -3,7 +3,7 @@ mwc (2.0.4-1) 
UNRELEASED; urgency=medium * New upstream release (Closes: #862004). + Remove now unuseable patches. * Rewrite debian/watch for archives without "v" in front of the version. - * Declare compliance with Debian Policy 4.0.0. (No changes needed). + * Declare compliance with Debian Policy 4.1.1. (No changes needed). * Bump compatlevel to 10 (no changes required): - Change debian/compat to 10. - At debian/control change requested version of debhelper to >= 10. -- cgit v1.2.3 From 9a955c414e34de441b5f188520314d54e3c5b3c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 1 Oct 2017 20:08:51 +0200 Subject: remove patches; rewrite d/changelog --- .gitignore | 2 +- debian/changelog | 6 ++- debian/control | 2 +- debian/patches/0100-config.diff | 88 ------------------------------------ debian/patches/0105-try_mail.diff | 52 --------------------- debian/patches/0110-syslog.diff | 64 -------------------------- debian/patches/0115-Add_header.patch | 50 -------------------- debian/patches/series | 4 -- 8 files changed, 6 insertions(+), 262 deletions(-) delete mode 100644 debian/patches/0100-config.diff delete mode 100644 debian/patches/0105-try_mail.diff delete mode 100644 debian/patches/0110-syslog.diff delete mode 100644 debian/patches/0115-Add_header.patch diff --git a/.gitignore b/.gitignore index 4ccd411..91ef9c0 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,4 @@ /*.txt *~ *.pyc - +.pc diff --git a/debian/changelog b/debian/changelog index 13fc5f0..ad84848 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,7 +1,9 @@ mwc (2.0.4-1) UNRELEASED; urgency=medium - * New upstream release (Closes: #862004). - + Remove now unuseable patches. + * New upstream release: + - Remove now unusable patches and therefore the printf function + (Closes: #860494). + - Add more Parameter needed at some websites (Closes: #862004). * Rewrite debian/watch for archives without "v" in front of the version. * Declare compliance with Debian Policy 4.1.1. 
(No changes needed). * Bump compatlevel to 10 (no changes required): diff --git a/debian/control b/debian/control index f38df7d..792ce2a 100644 --- a/debian/control +++ b/debian/control @@ -6,7 +6,7 @@ Build-Depends: debhelper (>= 10), dh-python, python3-all -Standards-Version: 4.0.0 +Standards-Version: 4.1.1 Homepage: https://github.com/Debianguru/MailWebsiteChanges Vcs-Git: git://anonscm.debian.org/collab-maint/mwc.git Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/mwc.git diff --git a/debian/patches/0100-config.diff b/debian/patches/0100-config.diff deleted file mode 100644 index 49bb589..0000000 --- a/debian/patches/0100-config.diff +++ /dev/null @@ -1,88 +0,0 @@ -Description: add loading config from every path - Add loading config from ervery path - Separation data / program -Author: Jörg Frings-Fürst -Last-Update: 2014-05-12 ---- -This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ -Index: trunk/mwcfeedserver.py -=================================================================== ---- trunk.orig/mwcfeedserver.py -+++ trunk/mwcfeedserver.py -@@ -6,6 +6,7 @@ - import http.server - import socketserver - import importlib -+import os - import sys - import getopt - -@@ -31,13 +32,26 @@ for opt, arg in opts: - elif opt in ('-p', '--port'): - port = int(arg) - --config = importlib.import_module(configMod) -- -+# -+# add code to load config from nonsystem path -+# and change to datadir -+# -+try: -+ path = os.path.dirname(configMod) -+ fullname = os.path.basename(configMod) -+ sys.path.append(path) -+ config = importlib.import_module(fullname) -+except: -+ print('Error: loading config') -+ sys.exit(2) - - handler = http.server.SimpleHTTPRequestHandler - - httpd = socketserver.TCPServer((bind, port), handler) - - print('Bond to ' + bind + ', listening on port ' + str(port)) --httpd.serve_forever() -- -+try: -+ httpd.serve_forever() -+except KeyboardInterrupt: -+ pass -+httpd.server_close() -Index: trunk/mwc.py 
-=================================================================== ---- trunk.orig/mwc.py -+++ trunk/mwc.py -@@ -334,7 +334,7 @@ def pollWebsites(): - - if __name__ == "__main__": - -- configMod = 'config' -+ configMod = '/etc/mwc/mwc-config' - dryrun = None - - try: -@@ -351,7 +351,23 @@ if __name__ == "__main__": - elif opt in ('-d', '--dry-run'): - dryrun = arg - -- config = importlib.import_module(configMod) -+ # -+ # add code to load config from nonsystem path -+ # and change to datadir -+ # -+ try: -+ path = os.path.dirname(configMod) -+ fullname = os.path.basename(configMod) -+ sys.path.append(path) -+ config = importlib.import_module(fullname) -+ except: -+ print('Error: loading config') -+ sys.exit(2) -+ try: -+ os.chdir(config.datadir) -+ except: -+ print('Error: datadir not found') -+ sys.exit(3) - - if dryrun: - for site in config.sites: diff --git a/debian/patches/0105-try_mail.diff b/debian/patches/0105-try_mail.diff deleted file mode 100644 index bc62ef1..0000000 --- a/debian/patches/0105-try_mail.diff +++ /dev/null @@ -1,52 +0,0 @@ -Description: try / except around mail functions - add try / except around mail functions to - prevent python errors messages -Author: Jörg Frings-Fürst -Forwarded: via mail -Applied-Upstream: -Reviewed-by: -Last-Update: 2014-05-22 ---- -This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ -Index: trunk/mwc.py -=================================================================== ---- trunk.orig/mwc.py -+++ trunk/mwc.py -@@ -232,16 +232,28 @@ def sendmail(receiver, subject, content, - mail['Subject'] = Header(subject, defaultEncoding) - - # initialize session once, not each time this method gets called -- if mailsession is None: -- mailsession = smtplib.SMTP(config.smtphost, config.smtpport) -- if config.useTLS: -- mailsession.ehlo() -- mailsession.starttls() -- if config.smtpusername is not None: -- mailsession.login(config.smtpusername, config.smtppwd) -- -- mailsession.sendmail(config.sender, 
receiver.split(','), mail.as_string()) - -+ # -+ # add try / except to open mailsession -+ # -+ try: -+ if mailsession is None: -+ mailsession = smtplib.SMTP(config.smtphost, config.smtpport) -+ if config.useTLS: -+ mailsession.ehlo() -+ mailsession.starttls() -+ mailsession.login(config.smtpusername, config.smtppwd) -+ # -+ # add try / except to send mail -+ # -+ except: -+ print('Error: Open smtp-session') -+ exit(4) -+ try: -+ mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) -+ except: -+ print('Error: sendmail') -+ exit(5) - - # returns a list of all content that is stored locally for a specific site - def getStoredHashes(shortname): diff --git a/debian/patches/0110-syslog.diff b/debian/patches/0110-syslog.diff deleted file mode 100644 index bd61d81..0000000 --- a/debian/patches/0110-syslog.diff +++ /dev/null @@ -1,64 +0,0 @@ -Description: add syslog messages on errors -Author: Jörg Frings-Fürst -Forwarded: via mail -Last-Update: 2014-05-22 ---- -This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ -Index: trunk/mwc.py -=================================================================== ---- trunk.orig/mwc.py -+++ trunk/mwc.py -@@ -21,6 +21,7 @@ import os - import sys - import getopt - import traceback -+import syslog - - import subprocess - -@@ -248,11 +249,13 @@ def sendmail(receiver, subject, content, - # - except: - print('Error: Open smtp-session') -+ syslog.syslog(syslog.LOG_ERR, 'can not open smtp session') - exit(4) - try: - mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) - except: - print('Error: sendmail') -+ syslog.syslog(syslog.LOG_ERR, 'error on sendmail') - exit(5) - - # returns a list of all content that is stored locally for a specific site -@@ -349,6 +352,11 @@ if __name__ == "__main__": - configMod = '/etc/mwc/mwc-config' - dryrun = None - -+ # -+ # add syslog open -+ # -+ syslog.openlog() -+ - try: - opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 
'dry-run=']) - except getopt.GetoptError: -@@ -374,11 +382,13 @@ if __name__ == "__main__": - config = importlib.import_module(fullname) - except: - print('Error: loading config') -+ syslog.syslog(syslog.LOG_ERR, 'can not found / load mwc-config') - sys.exit(2) - try: - os.chdir(config.datadir) - except: - print('Error: datadir not found') -+ syslog.syslog(syslog.LOG_ERR, 'datadir not found') - sys.exit(3) - - if dryrun: -@@ -400,3 +410,5 @@ if __name__ == "__main__": - if mailsession: - mailsession.quit() - mailsession = None -+ -+ syslog.closelog() diff --git a/debian/patches/0115-Add_header.patch b/debian/patches/0115-Add_header.patch deleted file mode 100644 index 6ce0c15..0000000 --- a/debian/patches/0115-Add_header.patch +++ /dev/null @@ -1,50 +0,0 @@ -Description: Add Header Accept -Author: Jörg Frings-Fürst -Bug-Debian: http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=862004 -Forwarded: https://github.com/Debianguru/MailWebsiteChanges/issues/11 -Last-Update: 2017-05-07 ---- -This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ -Index: trunk/mwc.py -=================================================================== ---- trunk.orig/mwc.py -+++ trunk/mwc.py -@@ -91,6 +91,8 @@ def parseSite(site): - req = urllib.request.Request(uri) - if 'user-agent' in site: - req.add_header('User-Agent', site['user-agent']) -+ if 'accept' in site: -+ req.add_header('Accept', site['accept']) - file = urllib.request.urlopen(req) - - -Index: trunk/README.md -=================================================================== ---- trunk.orig/README.md -+++ trunk/README.md -@@ -59,7 +59,9 @@ sites = [ - * user-agent (optional) - Defines the user agent string, e.g., - 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0' -- -+ * accept (optional) -+ Defines the accept string, e.g., -+ 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' - - * We collect some XPath/CSS snippets at this place: Snippet 
collection - please feel free to add your own definitions! - -Index: trunk/config_template.py -=================================================================== ---- trunk.orig/config_template.py -+++ trunk/config_template.py -@@ -12,7 +12,9 @@ sites = [ - 'contentxpath': '//div', - 'titleregex': '', - 'contentregex': '', -- 'encoding': 'utf-8'}, -+ 'encoding': 'utf-8', -+ 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0', -+ 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}, - - {'shortname': 'mywebsite2', - 'uri': 'http://www.mywebsite2.com/info', diff --git a/debian/patches/series b/debian/patches/series index 3ad4804..e69de29 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -1,4 +0,0 @@ -#0100-config.diff -#0105-try_mail.diff -#0110-syslog.diff -#0115-Add_header.patch -- cgit v1.2.3 From 613720c29e3613baf3eb56511eab13195f4e8790 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 1 Oct 2017 20:24:36 +0200 Subject: d/changelog: change release, date/time to upload --- debian/changelog | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debian/changelog b/debian/changelog index ad84848..716efc8 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -mwc (2.0.4-1) UNRELEASED; urgency=medium +mwc (2.0.4-1) unstable; urgency=medium * New upstream release: - Remove now unusable patches and therefore the printf function @@ -14,7 +14,7 @@ mwc (2.0.4-1) UNRELEASED; urgency=medium - Refresh copyright year at * and debian/*. * New README.source to explain the branching model used. - -- Jörg Frings-Fürst Sun, 06 Aug 2017 19:52:54 +0200 + -- Jörg Frings-Fürst Sun, 01 Oct 2017 20:21:11 +0200 mwc (1.7.2-3) unstable; urgency=medium -- cgit v1.2.3