From 76ea31d1747d8d95ec7ac75be750176beb452f66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 6 Aug 2017 19:52:14 +0200 Subject: New upstream version 1.7.6 --- README.md | 3 +++ config_template.py | 3 ++- mwc.py | 23 +++++++++++++++++++---- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d008527..8e78da6 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,8 @@ sites = [ Regular expression. If XPath/CSS selector is defined, the regular expression is applied afterwards. * encoding (optional; default: 'utf-8') Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'. + * splitregex (optional) + only works if type is set to 'text'; defines that content should be split to chunks based on the defined regex expression. * receiver (optional) Overrides global receiver specification. * user-agent (optional) @@ -79,6 +81,7 @@ sites = [
 
 enableMailNotifications = True   #enable/disable notification messages; if set to False, only send error messages
+maxMailsPerSession = -1   #max. number of mails to send per session; ignored when set to -1
 subjectPostfix = 'A website has been updated!'
 
 sender = 'me@mymail.com'
diff --git a/config_template.py b/config_template.py
index 02f7579..f394e52 100644
--- a/config_template.py
+++ b/config_template.py
@@ -15,7 +15,7 @@ sites = [
            'titleregex': '',
            'contentregex': '',
            'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0',
-           'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+           'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
            'encoding': 'utf-8'},
 
           {'shortname': 'mywebsite2',
@@ -42,6 +42,7 @@ sites = [
 subjectPostfix = 'A website has been updated!'
 
 enableMailNotifications = True
+maxMailsPerSession = -1
 sender = 'me@mymail.com'
 smtphost = 'mysmtpprovider.com'
 useTLS = True
diff --git a/mwc.py b/mwc.py
index a0635a1..c420a74 100755
--- a/mwc.py
+++ b/mwc.py
@@ -69,6 +69,7 @@ def parseSite(site):
         contenttype = site.get('type', 'html')
         contentregex = site.get('contentregex', '')
         titleregex = site.get('titleregex', '')
+        splitregex = site.get('splitregex', '')
         enc = site.get('encoding', defaultEncoding)
 
         contentxpath = site.get('contentxpath', '')
@@ -96,7 +97,10 @@ def parseSite(site):
 
 
                 if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
-                        contents = [file.read().decode(enc)]
+                        thefullcontent = file.read().decode(enc)
+                        contents = [thefullcontent]
+                        if splitregex != '':
+                                contents = thefullcontent.split(splitregex)
                         titles = []
                 else:
                         baseuri = uri
@@ -248,13 +252,13 @@ def getFileContents(shortname):
 
 
 # updates list of content that is stored locally for a specific site
-def storeFileContents(shortname, parseResult):
+def storeFileContents(shortname, contents):
         for f in os.listdir('.'):
                 if f.startswith(shortname + '.') and f.endswith('.txt'):
                         os.remove(f)
 
         i = 0
-        for c in parseResult['contents']:
+        for c in contents:
                 file = open(shortname + '.' + str(i) + '.txt', 'wb')
                 file.write(c.encode('utf-8'))
                 file.close()
@@ -271,7 +275,11 @@ def pollWebsites():
                         feedXML = etree.parse(io.StringIO(emptyfeed))
 
         # start polling sites
+        sessionContents = []
+        mailsSent = 0
         for site in config.sites:
+                if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession:
+                        break
 
                 print('polling site [' + site['shortname'] + '] ...')
                 parseResult = parseSite(site)
@@ -283,6 +291,7 @@ def pollWebsites():
                         print('WARNING: ' + parseResult['warning'])
                         if config.enableMailNotifications:
                                 sendmail(receiver, subject, parseResult['warning'], False, None)
+                                mailsSent = mailsSent + 1
                         if config.enableRSSFeed:
                                 feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0))
                 else:
@@ -291,13 +300,18 @@ def pollWebsites():
                         fileContents = getFileContents(site['shortname'])
                         i = 0
                         for content in parseResult['contents']:
+                                if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession:
+                                        break
+
                                 if content not in fileContents:
                                         changes += 1
+                                        sessionContents.append(content)
 
                                         subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i]
                                         print('    ' + subject)
                                         if config.enableMailNotifications and len(fileContents) > 0:
                                                 sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri'])
+                                                mailsSent = mailsSent + 1
 
                                         if config.enableRSSFeed:
                                                 feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes))
@@ -305,7 +319,7 @@ def pollWebsites():
 
 
                         if changes > 0:
-                                storeFileContents(site['shortname'], parseResult)
+                                storeFileContents(site['shortname'], sessionContents)
                                 print('        ' + str(changes) + ' updates')
  
         # store feed
@@ -343,6 +357,7 @@ if __name__ == "__main__":
                         if site['shortname'] == dryrun:
                                 parseResult = parseSite(site)
                                 print(parseResult)
+                                print(str(len(parseResult['contents'])) + " results")
                                 break
         else:
                 try:
-- 
cgit v1.2.3


From e121929e4aca8e1ce8167d5c7e0661ac48d6327d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= 
Date: Sun, 6 Aug 2017 19:55:54 +0200
Subject: New upstream release

---
 debian/changelog | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/debian/changelog b/debian/changelog
index 48ee0e8..44b9c3b 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-mwc (1.7.5-1) UNRELEASED; urgency=medium
+mwc (1.7.6-1) UNRELEASED; urgency=medium
 
   * New upstream release.
   * Renumbering patches.
@@ -13,7 +13,7 @@ mwc (1.7.5-1) UNRELEASED; urgency=medium
   * debian/copyright:
     - Refresh copyright year at * and debian/*.
 
- -- Jörg Frings-Fürst   Tue, 18 Apr 2017 11:06:04 +0200
+ -- Jörg Frings-Fürst   Sun, 06 Aug 2017 19:52:54 +0200
 
 mwc (1.7.2-3) unstable; urgency=medium
 
-- 
cgit v1.2.3


From 2aa6f5ebd42d98f0d9a3b7d94076ac9b6ee1ba45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= 
Date: Sun, 6 Aug 2017 20:16:36 +0200
Subject: New README.source to explain the branching model used; Declare
 compliance with Debian Policy 4.0.0

---
 .pc/.dpkg-source-unapply |  0
 .pc/.quilt_patches       |  1 +
 .pc/.quilt_series        |  1 +
 .pc/.version             |  1 +
 debian/README.source     | 18 ++++++++++++++++++
 debian/changelog         |  3 ++-
 debian/control           |  2 +-
 7 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 .pc/.dpkg-source-unapply
 create mode 100644 .pc/.quilt_patches
 create mode 100644 .pc/.quilt_series
 create mode 100644 .pc/.version
 create mode 100644 debian/README.source

diff --git a/.pc/.dpkg-source-unapply b/.pc/.dpkg-source-unapply
new file mode 100644
index 0000000..e69de29
diff --git a/.pc/.quilt_patches b/.pc/.quilt_patches
new file mode 100644
index 0000000..6857a8d
--- /dev/null
+++ b/.pc/.quilt_patches
@@ -0,0 +1 @@
+debian/patches
diff --git a/.pc/.quilt_series b/.pc/.quilt_series
new file mode 100644
index 0000000..c206706
--- /dev/null
+++ b/.pc/.quilt_series
@@ -0,0 +1 @@
+series
diff --git a/.pc/.version b/.pc/.version
new file mode 100644
index 0000000..0cfbf08
--- /dev/null
+++ b/.pc/.version
@@ -0,0 +1 @@
+2
diff --git a/debian/README.source b/debian/README.source
new file mode 100644
index 0000000..e4f2b3d
--- /dev/null
+++ b/debian/README.source
@@ -0,0 +1,18 @@
+Hello,
+
+now I use the branching model from Vincent Driessen[1].
+
+I use gitflow-avh[2] together with its documentation[3].
+The Debian package can be found here[4].
+
+For unattended uploads, please use a feature/ branch.
+
+
+Many thanks.
+
+ -- Jörg Frings-Fürst   Fri, 02 Jun 2017 19:00:40 +0200
+
+[1] http://nvie.com/posts/a-successful-git-branching-model/
+[2] https://github.com/petervanderdoes/gitflow-avh
+[3] https://github.com/petervanderdoes/gitflow-avh/wiki
+[4] https://tracker.debian.org/pkg/git-flow
diff --git a/debian/changelog b/debian/changelog
index 44b9c3b..18767a8 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -5,13 +5,14 @@ mwc (1.7.6-1) UNRELEASED; urgency=medium
   * debian/patches/0105-try_mail.diff:
     - Replace undefined printf with print (Closes: #860494).
   * Rewrite debian/watch for archives without "v" in front of the version.
-  * Bump Standards-Version to 3.9.8.
+  * Declare compliance with Debian Policy 4.0.0. (No changes needed).
   * Bump compatlevel to 10 (no changes required):
     - Change debian/compat to 10.
     - At debian/control change requested version of debhelper to >= 10.
   * At debian/control change Vcs-Browser to secure URI.
   * debian/copyright:
     - Refresh copyright year at * and debian/*.
+  * New README.source to explain the branching model used.
 
  -- Jörg Frings-Fürst   Sun, 06 Aug 2017 19:52:54 +0200
 
diff --git a/debian/control b/debian/control
index 70dd2d3..f38df7d 100644
--- a/debian/control
+++ b/debian/control
@@ -6,7 +6,7 @@ Build-Depends:
  debhelper (>= 10),
  dh-python,
  python3-all
-Standards-Version: 3.9.8
+Standards-Version: 4.0.0
 Homepage: https://github.com/Debianguru/MailWebsiteChanges
 Vcs-Git: git://anonscm.debian.org/collab-maint/mwc.git
 Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/mwc.git
-- 
cgit v1.2.3


From 6c03e9d2fa808b9c5a223c4d01f4d0b848fe97f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= 
Date: Sun, 6 Aug 2017 20:17:43 +0200
Subject: Add .pc to .gitignore

---
 .gitignore               | 1 +
 .pc/.dpkg-source-unapply | 0
 .pc/.quilt_patches       | 1 -
 .pc/.quilt_series        | 1 -
 .pc/.version             | 1 -
 5 files changed, 1 insertion(+), 3 deletions(-)
 delete mode 100644 .pc/.dpkg-source-unapply
 delete mode 100644 .pc/.quilt_patches
 delete mode 100644 .pc/.quilt_series
 delete mode 100644 .pc/.version

diff --git a/.gitignore b/.gitignore
index 4ccd411..f945b3b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,5 @@
 /*.txt
 *~
 *.pyc
+.pc
 
diff --git a/.pc/.dpkg-source-unapply b/.pc/.dpkg-source-unapply
deleted file mode 100644
index e69de29..0000000
diff --git a/.pc/.quilt_patches b/.pc/.quilt_patches
deleted file mode 100644
index 6857a8d..0000000
--- a/.pc/.quilt_patches
+++ /dev/null
@@ -1 +0,0 @@
-debian/patches
diff --git a/.pc/.quilt_series b/.pc/.quilt_series
deleted file mode 100644
index c206706..0000000
--- a/.pc/.quilt_series
+++ /dev/null
@@ -1 +0,0 @@
-series
diff --git a/.pc/.version b/.pc/.version
deleted file mode 100644
index 0cfbf08..0000000
--- a/.pc/.version
+++ /dev/null
@@ -1 +0,0 @@
-2
-- 
cgit v1.2.3


From f8f939634396158de53fb26fa7f9a539a92fb219 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= 
Date: Fri, 11 Aug 2017 04:42:19 +0200
Subject: New upstream version 1.8.2

---
 mwc.py | 574 ++++++++++++++++++++++++++++++++---------------------------------
 1 file changed, 287 insertions(+), 287 deletions(-)

diff --git a/mwc.py b/mwc.py
index c420a74..4df4799 100755
--- a/mwc.py
+++ b/mwc.py
@@ -1,4 +1,5 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 
 # Copyright: (2013-2015) Michael Till Beck 
 # License: GPL-2.0+
@@ -9,6 +10,7 @@ from lxml import etree
 from cssselect import GenericTranslator
 import re
 import io
+import hashlib
 
 import smtplib
 from email.mime.text import MIMEText
@@ -51,324 +53,322 @@ mailsession = None
 
 # translates all relative URIs found in trees to absolute URIs
 def toAbsoluteURIs(trees, baseuri):
-        for tree in trees:
-                if isinstance(tree, str):
-                        continue
-                for uriAttribute in uriAttributes:
-                        tags = tree.xpath(uriAttribute[0])
-                        for tag in tags:
-                                if tag.attrib.get(uriAttribute[1]) != None:
-                                        if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '':
-                                                tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]])
+    for tree in trees:
+        if isinstance(tree, str):
+            continue
+        for uriAttribute in uriAttributes:
+            tags = tree.xpath(uriAttribute[0])
+            for tag in tags:
+                if tag.attrib.get(uriAttribute[1]) != None:
+                    if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '':
+                        tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]])
 
 
 def parseSite(site):
-        file, content, titles, warning = None, None, None, None
-
-        uri = site['uri']
-        contenttype = site.get('type', 'html')
-        contentregex = site.get('contentregex', '')
-        titleregex = site.get('titleregex', '')
-        splitregex = site.get('splitregex', '')
-        enc = site.get('encoding', defaultEncoding)
-
-        contentxpath = site.get('contentxpath', '')
-        if contentxpath == '' and site.get('contentcss', '') != '':
-                # CSS
-                contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss'))
-        titlexpath = site.get('titlexpath', '')
-        if titlexpath == '' and site.get('titlecss', '') != '':
-                titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss'))
-
-        try:
-
-                if uri.startswith(cmdscheme):
-                        # run command and retrieve output
-                        process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True)
-                        file = process.stdout
-                else:
-                        # open website
-                        req = urllib.request.Request(uri)
-                        if 'user-agent' in site:
-                            req.add_header('User-Agent', site['user-agent'])
-                        if 'accept' in site:
-                            req.add_header('Accept', site['accept'])
-                        file = urllib.request.urlopen(req)
-
-
-                if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
-                        thefullcontent = file.read().decode(enc)
-                        contents = [thefullcontent]
-                        if splitregex != '':
-                                contents = thefullcontent.split(splitregex)
-                        titles = []
-                else:
-                        baseuri = uri
-                        if contenttype == 'html':
-                                parser = etree.HTMLParser(encoding=enc)
-                        else:
-                                parser = etree.XMLParser(recover=True, encoding=enc)
-
-                        tree = etree.parse(file, parser)
-
-                        # xpath
-                        contentresult = tree.xpath(contentxpath) if contentxpath else []
-                        titleresult = tree.xpath(titlexpath) if titlexpath else []
-
-                        # translate relative URIs to absolute URIs
-                        if contenttype == 'html':
-                                basetaglist = tree.xpath('/html/head/base')
-                                if len(basetaglist) != 0:
-                                        baseuri = basetaglist[0].attrib['href']
-                                if len(contentresult) != 0:
-                                        toAbsoluteURIs(contentresult, baseuri)
-                                if len(titleresult) != 0:
-                                        toAbsoluteURIs(titleresult, baseuri)
-
-                        if contentxpath != '' and titlexpath != '' and len(contentresult) != len(titleresult):
-                                warning = 'WARNING: number of title blocks (' + str(len(titleresult)) + ') does not match number of content blocks (' + str(len(contentresult)) + ')'
-                        elif contentxpath and len(contentresult) == 0:
-                                warning = 'WARNING: content selector became invalid!'
-                        elif titlexpath and len(titleresult) == 0:
-                                warning = 'WARNING: title selector became invalid!'
-                        else:
-                                if len(contentresult) == 0:
-                                        contentresult = titleresult
-                                if len(titleresult) == 0:
-                                        titleresult = contentresult
-
-                        if isinstance(contentresult, str):
-                                contents = [contentresult]
-                        else:
-                                contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult]
-                        if isinstance(titleresult, str):
-                                titles = [getSubject(titleresult)]
-                        else:
-                                titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult]
-
-        except IOError as e:
-                warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e)
-
-        if file is not None:
-                file.close()
-
-        if uri.startswith(cmdscheme) and process.wait() != 0:
-                warning = 'WARNING: process terminated with an error'
-
-        if warning:
-                return {'content': content, 'titles': titles, 'warning': warning}
-
-        # parse regex
-        if contentregex:
-                contents = [x for y in [re.findall(r'' + contentregex, c, re.S) for c in contents] for x in y]
-        if titleregex:
-                titles = [x for y in [re.findall(r'' + titleregex, c, re.S) for c in titles] for x in y]
-
-        if contentregex and titleregex and len(contents) != len(titles):
-                warning = 'WARNING: number of title blocks (' + str(len(titles)) + ') does not match number of content blocks (' + str(len(contents)) + ') after regex'
-        elif contentregex and len(contents) == 0:
-                warning = 'WARNING: content regex became invalid!'
-        elif titleregex and len(titles) == 0:
-                warning = 'WARNING: title regex became invalid!'
+    global defaultEncoding
+    file, content, titles, warning = None, None, None, None
+
+    uri = site['uri']
+    contenttype = site.get('type', 'html')
+    contentregex = site.get('contentregex', '')
+    titleregex = site.get('titleregex', '')
+    splitregex = site.get('splitregex', '')
+    enc = site.get('encoding', defaultEncoding)
+
+    contentxpath = site.get('contentxpath', '')
+    if contentxpath == '' and site.get('contentcss', '') != '':
+        # CSS
+        contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss'))
+    titlexpath = site.get('titlexpath', '')
+    if titlexpath == '' and site.get('titlecss', '') != '':
+        titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss'))
+
+    try:
+
+        if uri.startswith(cmdscheme):
+            # run command and retrieve output
+            process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True)
+            file = process.stdout
         else:
-                if len(contents) == 0:
-                        contents = titles
-                if len(titles) == 0:
-                        titles = [getSubject(c) for c in contents]
-
-        return {'contents': contents, 'titles': titles, 'warning': warning}
+            # open website
+            req = urllib.request.Request(uri)
+            if 'user-agent' in site:
+                req.add_header('User-Agent', site['user-agent'])
+            if 'accept' in site:
+                req.add_header('Accept', site['accept'])
+            file = urllib.request.urlopen(req)
+
+
+        if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
+            thefullcontent = file.read().decode(enc, errors='ignore')
+            contents = [thefullcontent]
+            if splitregex != '':
+                contents = thefullcontent.split(splitregex)
+            titles = []
+        else:
+            baseuri = uri
+            if contenttype == 'html':
+                parser = etree.HTMLParser(encoding=enc)
+            else:
+                parser = etree.XMLParser(recover=True, encoding=enc)
+
+            tree = etree.parse(file, parser)
+
+            # xpath
+            contentresult = tree.xpath(contentxpath) if contentxpath else []
+            titleresult = tree.xpath(titlexpath) if titlexpath else []
+
+            # translate relative URIs to absolute URIs
+            if contenttype == 'html':
+                basetaglist = tree.xpath('/html/head/base')
+                if len(basetaglist) != 0:
+                    baseuri = basetaglist[0].attrib['href']
+                if len(contentresult) != 0:
+                    toAbsoluteURIs(contentresult, baseuri)
+                if len(titleresult) != 0:
+                    toAbsoluteURIs(titleresult, baseuri)
+
+            if contentxpath != '' and titlexpath != '' and len(contentresult) != len(titleresult):
+                warning = 'WARNING: number of title blocks (' + str(len(titleresult)) + ') does not match number of content blocks (' + str(len(contentresult)) + ')'
+            elif contentxpath and len(contentresult) == 0:
+                warning = 'WARNING: content selector became invalid!'
+            elif titlexpath and len(titleresult) == 0:
+                warning = 'WARNING: title selector became invalid!'
+            else:
+                if len(contentresult) == 0:
+                    contentresult = titleresult
+                if len(titleresult) == 0:
+                    titleresult = contentresult
+
+            if isinstance(contentresult, str):
+                contents = [contentresult]
+            else:
+                contents = [etree.tostring(s, encoding=enc, pretty_print=True).decode(enc, errors='ignore') for s in contentresult]
+            if isinstance(titleresult, str):
+                titles = [getSubject(titleresult)]
+            else:
+                titles = [getSubject(etree.tostring(s, method='text', encoding=enc).decode(enc, errors='ignore')) for s in titleresult]
+
+    except IOError as e:
+        warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e)
+
+    if file is not None:
+        file.close()
+
+    if uri.startswith(cmdscheme) and process.wait() != 0:
+        warning = 'WARNING: process terminated with an error'
+
+    if warning:
+        return {'content': content, 'titles': titles, 'warning': warning}
+
+    # parse regex
+    if contentregex:
+        contents = [x for y in [re.findall(r'' + contentregex, c, re.S) for c in contents] for x in y]
+    if titleregex:
+        titles = [x for y in [re.findall(r'' + titleregex, c, re.S) for c in titles] for x in y]
+
+    if contentregex and titleregex and len(contents) != len(titles):
+        warning = 'WARNING: number of title blocks (' + str(len(titles)) + ') does not match number of content blocks (' + str(len(contents)) + ') after regex'
+    elif contentregex and len(contents) == 0:
+        warning = 'WARNING: content regex became invalid!'
+    elif titleregex and len(titles) == 0:
+        warning = 'WARNING: title regex became invalid!'
+    else:
+        if len(contents) == 0:
+            contents = titles
+        if len(titles) == 0:
+            titles = [getSubject(c) for c in contents]
+
+    return {'contents': contents, 'titles': titles, 'warning': warning}
 
 
 # returns a short subject line
 def getSubject(textContent):
-        if textContent == None or textContent == '':
-                return config.subjectPostfix
-        textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip()
-        return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent
+    if textContent == None or textContent == '':
+        return config.subjectPostfix
+    textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip()
+    return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent
 
 
 # generates a new RSS feed item
 def genFeedItem(subject, content, link, change):
-        feeditem = etree.Element('item')
-        titleitem = etree.Element('title')
-        titleitem.text = subject + ' #' + str(change)
-        feeditem.append(titleitem)
-        linkitem = etree.Element('link')
-        linkitem.text = link
-        feeditem.append(linkitem)
-        descriptionitem = etree.Element('description')
-        descriptionitem.text = content
-        feeditem.append(descriptionitem)
-        guiditem = etree.Element('guid')
-        guiditem.text = str(random.getrandbits(32))
-        feeditem.append(guiditem)
-        dateitem = etree.Element('pubDate')
-        dateitem.text = strftime("%a, %d %b %Y %H:%M:%S %Z", time.localtime())
-        feeditem.append(dateitem)
-
-        return feeditem
+    feeditem = etree.Element('item')
+    titleitem = etree.Element('title')
+    titleitem.text = subject + ' #' + str(change)
+    feeditem.append(titleitem)
+    linkitem = etree.Element('link')
+    linkitem.text = link
+    feeditem.append(linkitem)
+    descriptionitem = etree.Element('description')
+    descriptionitem.text = content
+    feeditem.append(descriptionitem)
+    guiditem = etree.Element('guid')
+    guiditem.text = str(random.getrandbits(32))
+    feeditem.append(guiditem)
+    dateitem = etree.Element('pubDate')
+    dateitem.text = strftime("%a, %d %b %Y %H:%M:%S %Z", time.localtime())
+    feeditem.append(dateitem)
+
+    return feeditem
 
 
 # sends mail notification
 def sendmail(receiver, subject, content, sendAsHtml, link):
-        global mailsession
-
-        if sendAsHtml:
-                baseurl = None
-                if link != None:
-                        content = '

' + subject + '

\n' + content - baseurl = urljoin(link, '/') - mail = MIMEText('' + subject + '' + ('' if baseurl else '') + '' + content + '', 'html', defaultEncoding) - else: - if link != None: - content = link + '\n\n' + content - mail = MIMEText(content, 'text', defaultEncoding) + global mailsession, defaultEncoding + + if sendAsHtml: + baseurl = None + if link != None: + content = '

' + subject + '

\n' + content + baseurl = urljoin(link, '/') + mail = MIMEText('' + subject + '' + ('' if baseurl else '') + '' + content + '', 'html', defaultEncoding) + else: + if link != None: + content = link + '\n\n' + content + mail = MIMEText(content, 'text', defaultEncoding) + + mail['From'] = config.sender + mail['To'] = receiver + mail['Subject'] = Header(subject, defaultEncoding) + + # initialize session once, not each time this method gets called + if mailsession is None: + mailsession = smtplib.SMTP(config.smtphost, config.smtpport) + if config.useTLS: + mailsession.ehlo() + mailsession.starttls() + if config.smtpusername is not None: + mailsession.login(config.smtpusername, config.smtppwd) + + mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) - mail['From'] = config.sender - mail['To'] = receiver - mail['Subject'] = Header(subject, defaultEncoding) - # initialize session once, not each time this method gets called - if mailsession is None: - mailsession = smtplib.SMTP(config.smtphost, config.smtpport) - if config.useTLS: - mailsession.ehlo() - mailsession.starttls() - if config.smtpusername is not None: - mailsession.login(config.smtpusername, config.smtppwd) - - mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) +# returns a list of all content that is stored locally for a specific site +def getStoredHashes(shortname): + result = [] + filename = shortname + ".txt" + if os.path.exists(filename): + with open(filename, 'r') as file: + for line in file: + result.append(line.rstrip()) -# returns a list of all content that is stored locally for a specific site -def getFileContents(shortname): - result = [] - for f in os.listdir('.'): - if f.startswith(shortname + '.') and f.endswith('.txt'): - file = open(f, 'rb') - result.append(file.read().decode('utf-8')) - file.close() - return result + return result # updates list of content that is stored locally for a specific site -def storeFileContents(shortname, contents): - for f 
in os.listdir('.'): - if f.startswith(shortname + '.') and f.endswith('.txt'): - os.remove(f) - - i = 0 - for c in contents: - file = open(shortname + '.' + str(i) + '.txt', 'wb') - file.write(c.encode('utf-8')) - file.close() - i += 1 +def storeHashes(shortname, contentHashes): + with open(shortname + '.txt', 'w') as file: + for h in contentHashes: + file.write(h + "\n") -def pollWebsites(): - # parse existing feed or create a new one - if config.enableRSSFeed: - if os.path.isfile(config.rssfile): - feedXML = etree.parse(config.rssfile) - else: - feedXML = etree.parse(io.StringIO(emptyfeed)) +def pollWebsites(): + global defaultEncoding - # start polling sites - sessionContents = [] - mailsSent = 0 - for site in config.sites: - if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession: - break + # parse existing feed or create a new one + if config.enableRSSFeed: + if os.path.isfile(config.rssfile): + feedXML = etree.parse(config.rssfile) + else: + feedXML = etree.parse(io.StringIO(emptyfeed)) + + # start polling sites + mailsSent = 0 + for site in config.sites: + print('polling site [' + site['shortname'] + '] ...') + sessionHashes = [] + parseResult = parseSite(site) + receiver = site.get('receiver', config.receiver) + + # if something went wrong, notify the user + if parseResult['warning']: + subject = '[' + site['shortname'] + '] WARNING' + print('WARNING: ' + parseResult['warning']) + if config.enableMailNotifications: + if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: + sendmail(receiver, subject, parseResult['warning'], False, None) + mailsSent = mailsSent + 1 + if config.enableRSSFeed: + feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0)) + else: + # otherwise, check which parts of the site were updated + changes = 0 + fileHashes = getStoredHashes(site['shortname']) + i = 0 + for content in parseResult['contents']: + + contenthash = 
hashlib.md5(content.encode(defaultEncoding)).hexdigest() + if contenthash not in fileHashes: + if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: + changes += 1 + sessionHashes.append(contenthash) + + subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i] + print(' ' + subject) + if config.enableMailNotifications and len(fileHashes) > 0: + sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri']) + mailsSent = mailsSent + 1 - print('polling site [' + site['shortname'] + '] ...') - parseResult = parseSite(site) - receiver = site.get('receiver', config.receiver) - - # if something went wrong, notify the user - if parseResult['warning']: - subject = '[' + site['shortname'] + '] WARNING' - print('WARNING: ' + parseResult['warning']) - if config.enableMailNotifications: - sendmail(receiver, subject, parseResult['warning'], False, None) - mailsSent = mailsSent + 1 if config.enableRSSFeed: - feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0)) + feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes)) else: - # otherwise, check which parts of the site were updated - changes = 0 - fileContents = getFileContents(site['shortname']) - i = 0 - for content in parseResult['contents']: - if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession: - break - - if content not in fileContents: - changes += 1 - sessionContents.append(content) - - subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i] - print(' ' + subject) - if config.enableMailNotifications and len(fileContents) > 0: - sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri']) - mailsSent = mailsSent + 1 - - if config.enableRSSFeed: - feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes)) - i += 1 - - - if changes > 0: - storeFileContents(site['shortname'], sessionContents) - 
print(' ' + str(changes) + ' updates') - - # store feed - if config.enableRSSFeed: - for o in feedXML.xpath('//channel/item[position() 0: + storeHashes(site['shortname'], sessionHashes) + print(' ' + str(changes) + ' updates') - try: - opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 'dry-run=']) - except getopt.GetoptError: - print('Usage: mwc.py --config=config --dry-run=shortname') - sys.exit(1) - for opt, arg in opts: - if opt == '-h': - print('Usage: mwc.py --config=config') - exit() - elif opt in ('-c', '--config'): - configMod = arg - elif opt in ('-d', '--dry-run'): - dryrun = arg - - config = importlib.import_module(configMod) - - if dryrun: - for site in config.sites: - if site['shortname'] == dryrun: - parseResult = parseSite(site) - print(parseResult) - print(str(len(parseResult['contents'])) + " results") - break - else: - try: - pollWebsites() - except: - msg = str(sys.exc_info()[0]) + '\n\n' + traceback.format_exc() - print(msg) - if config.receiver != '': - sendmail(config.receiver, '[mwc] Something went wrong ...', msg, False, None) - - if mailsession: - mailsession.quit() - mailsession = None + # store feed + if config.enableRSSFeed: + for o in feedXML.xpath('//channel/item[position() Date: Fri, 11 Aug 2017 05:41:14 +0200 Subject: New upstream release; refresh patches --- debian/changelog | 5 +- debian/patches/0100-config.diff | 81 +++++++++++++------------- debian/patches/0105-try_mail.diff | 68 +++++++++++----------- debian/patches/0110-syslog.diff | 118 ++++++++++++++------------------------ 4 files changed, 119 insertions(+), 153 deletions(-) diff --git a/debian/changelog b/debian/changelog index 18767a8..63449fb 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,6 +1,7 @@ -mwc (1.7.6-1) UNRELEASED; urgency=medium +mwc (1.8.2-1) UNRELEASED; urgency=medium - * New upstream release. + * New upstream release (Closes: #862004). + + Refresh patches. * Renumbering patches. 
* debian/patches/0105-try_mail.diff: - Replace undefined printf with print (Closes: #860494). diff --git a/debian/patches/0100-config.diff b/debian/patches/0100-config.diff index ce4dba7..8529874 100644 --- a/debian/patches/0100-config.diff +++ b/debian/patches/0100-config.diff @@ -5,47 +5,6 @@ Author: Jörg Frings-Fürst Last-Update: 2014-05-12 --- This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ -Index: trunk/mwc.py -=================================================================== ---- trunk.orig/mwc.py -+++ trunk/mwc.py -@@ -319,7 +319,7 @@ def pollWebsites(): - - if __name__ == "__main__": - -- configMod = 'config' -+ configMod = '/etc/mwc/mwc-config' - dryrun = None - - try: -@@ -335,9 +335,24 @@ if __name__ == "__main__": - configMod = arg - elif opt in ('-d', '--dry-run'): - dryrun = arg -- -- config = importlib.import_module(configMod) -- -+ # -+ # add code to load config from nonsystem path -+ # and change to datadir -+ # -+ try: -+ path = os.path.dirname(configMod) -+ fullname = os.path.basename(configMod) -+ sys.path.append(path) -+ config = importlib.import_module(fullname) -+ except: -+ print('Error: loading config') -+ sys.exit(2) -+ try: -+ os.chdir(config.datadir) -+ except: -+ print('Error: datadir not found') -+ sys.exit(3) -+ - if dryrun: - for site in config.sites: - if site['shortname'] == dryrun: Index: trunk/config_template.py =================================================================== --- trunk.orig/config_template.py @@ -56,7 +15,7 @@ Index: trunk/config_template.py # Copyright: (2013-2014) Michael Till Beck # License: GPL-2.0+ -@@ -46,11 +44,11 @@ sender = 'me@mymail.com' +@@ -47,11 +45,11 @@ sender = 'me@mymail.com' smtphost = 'mysmtpprovider.com' useTLS = True smtpport = 587 @@ -113,3 +72,41 @@ Index: trunk/mwcfeedserver.py +except KeyboardInterrupt: + pass +httpd.server_close() +Index: trunk/mwc.py +=================================================================== +--- trunk.orig/mwc.py ++++ trunk/mwc.py +@@ 
-334,7 +334,7 @@ def pollWebsites(): + + if __name__ == "__main__": + +- configMod = 'config' ++ configMod = '/etc/mwc/mwc-config' + dryrun = None + + try: +@@ -351,7 +351,23 @@ if __name__ == "__main__": + elif opt in ('-d', '--dry-run'): + dryrun = arg + +- config = importlib.import_module(configMod) ++ # ++ # add code to load config from nonsystem path ++ # and change to datadir ++ # ++ try: ++ path = os.path.dirname(configMod) ++ fullname = os.path.basename(configMod) ++ sys.path.append(path) ++ config = importlib.import_module(fullname) ++ except: ++ print('Error: loading config') ++ sys.exit(2) ++ try: ++ os.chdir(config.datadir) ++ except: ++ print('Error: datadir not found') ++ sys.exit(3) + + if dryrun: + for site in config.sites: diff --git a/debian/patches/0105-try_mail.diff b/debian/patches/0105-try_mail.diff index d390b6d..bc62ef1 100644 --- a/debian/patches/0105-try_mail.diff +++ b/debian/patches/0105-try_mail.diff @@ -12,41 +12,41 @@ Index: trunk/mwc.py =================================================================== --- trunk.orig/mwc.py +++ trunk/mwc.py -@@ -225,16 +225,27 @@ def sendmail(receiver, subject, content, - mail['Subject'] = Header(subject, defaultEncoding) +@@ -232,16 +232,28 @@ def sendmail(receiver, subject, content, + mail['Subject'] = Header(subject, defaultEncoding) - # initialize session once, not each time this method gets called -- if mailsession is None: -- mailsession = smtplib.SMTP(config.smtphost, config.smtpport) -- if config.useTLS: -- mailsession.ehlo() -- mailsession.starttls() -- if config.smtpusername is not None: -- mailsession.login(config.smtpusername, config.smtppwd) + # initialize session once, not each time this method gets called +- if mailsession is None: +- mailsession = smtplib.SMTP(config.smtphost, config.smtpport) +- if config.useTLS: +- mailsession.ehlo() +- mailsession.starttls() +- if config.smtpusername is not None: +- mailsession.login(config.smtpusername, config.smtppwd) - -- 
mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) -- -+ # -+ # add try / except to open mailsession -+ # -+ try: -+ if mailsession is None: -+ mailsession = smtplib.SMTP(config.smtphost, config.smtpport) -+ if config.useTLS: -+ mailsession.ehlo() -+ mailsession.starttls() -+ mailsession.login(config.smtpusername, config.smtppwd) -+ # -+ # add try / except to send mail -+ # -+ except: -+ print('Error: Open smtp-session') -+ exit(4) -+ try: -+ mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) -+ except: -+ print('Error: sendmail') -+ exit(5) +- mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) + ++ # ++ # add try / except to open mailsession ++ # ++ try: ++ if mailsession is None: ++ mailsession = smtplib.SMTP(config.smtphost, config.smtpport) ++ if config.useTLS: ++ mailsession.ehlo() ++ mailsession.starttls() ++ mailsession.login(config.smtpusername, config.smtppwd) ++ # ++ # add try / except to send mail ++ # ++ except: ++ print('Error: Open smtp-session') ++ exit(4) ++ try: ++ mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) ++ except: ++ print('Error: sendmail') ++ exit(5) # returns a list of all content that is stored locally for a specific site - def getFileContents(shortname): + def getStoredHashes(shortname): diff --git a/debian/patches/0110-syslog.diff b/debian/patches/0110-syslog.diff index 12d629d..bd61d81 100644 --- a/debian/patches/0110-syslog.diff +++ b/debian/patches/0110-syslog.diff @@ -8,7 +8,7 @@ Index: trunk/mwc.py =================================================================== --- trunk.orig/mwc.py +++ trunk/mwc.py -@@ -19,6 +19,7 @@ import os +@@ -21,6 +21,7 @@ import os import sys import getopt import traceback @@ -16,81 +16,49 @@ Index: trunk/mwc.py import subprocess -@@ -227,25 +228,28 @@ def sendmail(receiver, subject, content, - # initialize session once, not each time this method gets called - # - # add try / except to open mailsession 
-- # -+ # -+ - try: -- if mailsession is None: -- mailsession = smtplib.SMTP(config.smtphost, config.smtpport) -- if config.useTLS: -- mailsession.ehlo() -- mailsession.starttls() -- mailsession.login(config.smtpusername, config.smtppwd) -- # -+ if mailsession is None: -+ mailsession = smtplib.SMTP(config.smtphost, config.smtpport) -+ if config.useTLS: -+ mailsession.ehlo() -+ mailsession.starttls() -+ mailsession.login(config.smtpusername, config.smtppwd) -+ except: -+ print('Error: Open smtp-session') -+ syslog.syslog(syslog.LOG_ERR, 'can not open smtp session') -+ exit(4) -+ # - # add try / except to send mail - # -+ try: -+ mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) - except: -- print('Error: Open smtp-session') -- exit(4) -- try: -- mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) -- except: -- print('Error: sendmail') -- exit(5) -+ print('Error: sendmail') -+ syslog.syslog(syslog.LOG_ERR, 'error on sendmail') -+ exit(5) +@@ -248,11 +249,13 @@ def sendmail(receiver, subject, content, + # + except: + print('Error: Open smtp-session') ++ syslog.syslog(syslog.LOG_ERR, 'can not open smtp session') + exit(4) + try: + mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) + except: + print('Error: sendmail') ++ syslog.syslog(syslog.LOG_ERR, 'error on sendmail') + exit(5) # returns a list of all content that is stored locally for a specific site - def getFileContents(shortname): -@@ -332,7 +336,11 @@ if __name__ == "__main__": +@@ -349,6 +352,11 @@ if __name__ == "__main__": + configMod = '/etc/mwc/mwc-config' + dryrun = None - configMod = '/etc/mwc/mwc-config' - dryrun = None -- -+ -+ # -+ # add syslog open -+ # -+ syslog.openlog() - try: - opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 'dry-run=']) - except getopt.GetoptError: -@@ -357,11 +365,13 @@ if __name__ == "__main__": - config = importlib.import_module(fullname) - except: - print('Error: loading config') -+ 
syslog.syslog(syslog.LOG_ERR, 'can not found / load mwc-config') - sys.exit(2) - try: - os.chdir(config.datadir) - except: - print('Error: datadir not found') -+ syslog.syslog(syslog.LOG_ERR, 'datadir not found') - sys.exit(3) - - if dryrun: -@@ -383,3 +393,5 @@ if __name__ == "__main__": - mailsession.quit() - mailsession = None ++ # ++ # add syslog open ++ # ++ syslog.openlog() ++ + try: + opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 'dry-run=']) + except getopt.GetoptError: +@@ -374,11 +382,13 @@ if __name__ == "__main__": + config = importlib.import_module(fullname) + except: + print('Error: loading config') ++ syslog.syslog(syslog.LOG_ERR, 'can not found / load mwc-config') + sys.exit(2) + try: + os.chdir(config.datadir) + except: + print('Error: datadir not found') ++ syslog.syslog(syslog.LOG_ERR, 'datadir not found') + sys.exit(3) -+ syslog.closelog() -+ -\ No newline at end of file + if dryrun: +@@ -400,3 +410,5 @@ if __name__ == "__main__": + if mailsession: + mailsession.quit() + mailsession = None ++ ++ syslog.closelog() -- cgit v1.2.3 From 04b13e003d6af0de21e6c59e411ffee5b97b6134 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 1 Oct 2017 18:50:17 +0200 Subject: New upstream version 2.0.4 --- README.md | 89 ++++++++++------ config_template.py | 74 ++++++------- mwc.py | 298 +++++++++++++++-------------------------------------- mwcfeedserver.py | 31 +++--- mwctools.py | 239 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 433 insertions(+), 298 deletions(-) mode change 100644 => 100755 config_template.py create mode 100755 mwctools.py diff --git a/README.md b/README.md index 8e78da6..69718e7 100644 --- a/README.md +++ b/README.md @@ -19,20 +19,23 @@ Some examples: sites = [ - {'shortname': 'mywebsite1', - 'uri': 'http://www.mywebsite1.com/info', - 'contentcss': 'div'}, - - {'shortname': 'mywebsite2', - 'uri': 'http://www.mywebsite2.com/info', - 'contentxpath': '//*[contains(concat(\' 
\', normalize-space(@class), \' \'), \' news-list-container \')]', - 'titlexpath': '//title'}, - - {'shortname': 'mywebsite3', - 'uri': 'http://www.mywebsite3.com/info', - 'type': 'text', - 'contentregex': 'Version\"\:\d*\.\d*', - 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'} + {'name': 'example-css', + 'parsers': [uri(uri='https://github.com/mtill', contenttype='html'), + css(contentcss='div') + ] + }, + + {'name': 'example-xpath', + 'parsers': [uri(uri='https://example-webpage.com/test', contenttype='html'), + xpath(contentxpath='//div[contains(concat(\' \', normalize-space(@class), \' \'), \' package-version-header \')]') + ] + }, + + {'name': 'my-script', + 'parsers': [command(command='/home/user/script.sh', contenttype='text'), + regex(contentregex='^.*$') + ] + } ] @@ -40,31 +43,55 @@ sites = [ * parameters: - * shortname - short name of the entry, used as an identifier when sending email notifications + * name + name of the entry, used as an identifier when sending email notifications + * receiver (optional) + Overrides global receiver specification. + + * parameters for the URL receiver: + * uri - URI of the website; If the scheme of the uri is 'cmd://', the string is interpreted as a command and the standard output (stdout) is parsed. - * type (optional; default: 'html') + URI of the website + * contenttype (optional; default: 'html') content type, e.g., 'xml'/'html'/'text'. - * contentxpath / titlexpath (optional) - XPath expression for the content/title sections to extract. If you prefer, you could use contentcss/titlecss instead. - * contentcss / titlecss (optional) - CSS expression for the content/title sections to extract. This is ignored if there is a corresponding XPath definition. - * contentregex / titleregex (optional) - Regular expression. If XPath/CSS selector is defined, the regular expression is applied afterwards. 
- * encoding (optional; default: 'utf-8') + * enc (optional; default: 'utf-8') Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'. - * splitregex (optional) - only works if type is set to 'text'; defines that content should be split to chunks based on the defined regex expression. - * receiver (optional) - Overrides global receiver specification. - * user-agent (optional) + * userAgent (optional) Defines the user agent string, e.g., - 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0' + 'userAgent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0' * accept (optional) Defines the accept string, e.g., 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' + * parameters for the Command receiver + + * command + the command + * contenttype (optional; default: 'text') + content type, e.g., 'xml'/'html'/'text'. + * enc (optional; default: 'utf-8') + Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'. + + * parameters for the XPath parser: + + * contentxpath + XPath expression for the content sections to extract + * titlexpath (optional) + XPath expression for the title sections to extract + + * parameters for the CSS parser: + + * contentcss + CSS expression for the content sections to extract + * titlecss (optional) + CSS expression for the title sections to extract + + * parameters for the RegEx parser: + + * contentregex + Regular expression for content parsing + * titleregex (optional) + Regular expression for title parsing * We collect some XPath/CSS snippets at this place: Snippet collection - please feel free to add your own definitions! 
diff --git a/config_template.py b/config_template.py old mode 100644 new mode 100755 index f394e52..02788bd --- a/config_template.py +++ b/config_template.py @@ -1,47 +1,49 @@ -import os.path +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- -# Copyright: (2013-2014) Michael Till Beck +# Copyright: (2013-2017) Michael Till Beck # License: GPL-2.0+ -#We collect xpath snippets at this place: Snippet collection - please feel free to add your own definitions! + +# We collect xpath snippets at this place: +# Snippet collection +# Feel free to contribute! + + +from mwctools import URLReceiver as uri +from mwctools import CommandReceiver as command +from mwctools import XPathParser as xpath +from mwctools import CSSParser as css +from mwctools import RegExParser as regex +from mwctools import Content +from mwctools import Parser + sites = [ - {'shortname': 'mywebsite1', - 'uri': 'http://www.mywebsite1.com/info', - 'type': 'html', - 'titlexpath': '//h1', - 'contentxpath': '//div', - 'titleregex': '', - 'contentregex': '', - 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0', - 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' - 'encoding': 'utf-8'}, - - {'shortname': 'mywebsite2', - 'uri': 'http://www.mywebsite2.com/info', - 'type': 'html', - 'contentxpath': '//*[contains(concat(\' \', normalize-space(@class), \' \'), \' news-list-container \')]', - 'regex': '', - 'encoding': 'utf-8'}, - - {'shortname': 'mywebsite3', - 'uri': 'http://www.mywebsite3.com/info', - 'type': 'text', - 'contentxpath': '', - 'contentregex': 'Version\"\:\d*\.\d*', - 'encoding': 'utf-8'}, - - {'shortname': 'lscmd', - 'uri': 'cmd://ls -l /home/pi', - 'contentregex': '.*Desktop.*' - } + {'name': 'example-css', + 'parsers': [uri(uri='https://github.com/mtill', contenttype='html'), + css(contentcss='div') + ] + }, + + {'name': 'example-xpath', + 'parsers': [uri(uri='https://example-webpage.com/test', contenttype='html'), + 
xpath(contentxpath='//div[contains(concat(\' \', normalize-space(@class), \' \'), \' package-version-header \')]') + ] + }, + + {'name': 'my-script', + 'parsers': [command(command='/home/user/script.sh', contenttype='text'), + regex(contentregex='^.*$') + ] + } ] -subjectPostfix = 'A website has been updated!' +workingDirectory = '/path-to-data-dir/MailWebsiteChanges-data' -enableMailNotifications = True +enableMailNotifications = False maxMailsPerSession = -1 sender = 'me@mymail.com' smtphost = 'mysmtpprovider.com' @@ -51,9 +53,7 @@ smtpusername = sender smtppwd = 'mypassword' receiver = 'me2@mymail.com' -os.chdir('/var/cache/mwc') - -enableRSSFeed = True +enableRSSFeed = False rssfile = 'feed.xml' maxFeeds = 100 diff --git a/mwc.py b/mwc.py index 4df4799..6a48317 100755 --- a/mwc.py +++ b/mwc.py @@ -1,15 +1,11 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright: (2013-2015) Michael Till Beck +# Copyright: (2013-2017) Michael Till Beck # License: GPL-2.0+ -import urllib.request, urllib.error, urllib.parse -import urllib.parse -from lxml import etree -from cssselect import GenericTranslator -import re import io +from lxml import etree import hashlib import smtplib @@ -22,8 +18,6 @@ import sys import getopt import traceback -import subprocess - import time from time import strftime import random @@ -32,164 +26,20 @@ import importlib config = None defaultEncoding = 'utf-8' -maxTitleLength = 150 # this is how an empty RSS feed looks like emptyfeed = """ MailWebsiteChanges Feed - https://github.com/Debianguru/MailWebsiteChanges + https://github.com/mtill/MailWebsiteChanges MailWebsiteChanges Feed """ -# Attributes in HTML files storing URI values. These values are automatically translated to absolute URIs. 
-uriAttributes = [['//img[@src]', 'src'], ['//a[@href]', 'href']] -cmdscheme = 'cmd://' - mailsession = None -# translates all relative URIs found in trees to absolute URIs -def toAbsoluteURIs(trees, baseuri): - for tree in trees: - if isinstance(tree, str): - continue - for uriAttribute in uriAttributes: - tags = tree.xpath(uriAttribute[0]) - for tag in tags: - if tag.attrib.get(uriAttribute[1]) != None: - if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '': - tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]]) - - -def parseSite(site): - global defaultEncoding - file, content, titles, warning = None, None, None, None - - uri = site['uri'] - contenttype = site.get('type', 'html') - contentregex = site.get('contentregex', '') - titleregex = site.get('titleregex', '') - splitregex = site.get('splitregex', '') - enc = site.get('encoding', defaultEncoding) - - contentxpath = site.get('contentxpath', '') - if contentxpath == '' and site.get('contentcss', '') != '': - # CSS - contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss')) - titlexpath = site.get('titlexpath', '') - if titlexpath == '' and site.get('titlecss', '') != '': - titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss')) - - try: - - if uri.startswith(cmdscheme): - # run command and retrieve output - process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True) - file = process.stdout - else: - # open website - req = urllib.request.Request(uri) - if 'user-agent' in site: - req.add_header('User-Agent', site['user-agent']) - if 'accept' in site: - req.add_header('Accept', site['accept']) - file = urllib.request.urlopen(req) - - - if contenttype == 'text' or (contentxpath == '' and titlexpath == ''): - thefullcontent = file.read().decode(enc, errors='ignore') - contents = [thefullcontent] - if splitregex != '': - contents = thefullcontent.split(splitregex) - titles = [] - else: - baseuri = 
uri - if contenttype == 'html': - parser = etree.HTMLParser(encoding=enc) - else: - parser = etree.XMLParser(recover=True, encoding=enc) - - tree = etree.parse(file, parser) - - # xpath - contentresult = tree.xpath(contentxpath) if contentxpath else [] - titleresult = tree.xpath(titlexpath) if titlexpath else [] - - # translate relative URIs to absolute URIs - if contenttype == 'html': - basetaglist = tree.xpath('/html/head/base') - if len(basetaglist) != 0: - baseuri = basetaglist[0].attrib['href'] - if len(contentresult) != 0: - toAbsoluteURIs(contentresult, baseuri) - if len(titleresult) != 0: - toAbsoluteURIs(titleresult, baseuri) - - if contentxpath != '' and titlexpath != '' and len(contentresult) != len(titleresult): - warning = 'WARNING: number of title blocks (' + str(len(titleresult)) + ') does not match number of content blocks (' + str(len(contentresult)) + ')' - elif contentxpath and len(contentresult) == 0: - warning = 'WARNING: content selector became invalid!' - elif titlexpath and len(titleresult) == 0: - warning = 'WARNING: title selector became invalid!' 
- else: - if len(contentresult) == 0: - contentresult = titleresult - if len(titleresult) == 0: - titleresult = contentresult - - if isinstance(contentresult, str): - contents = [contentresult] - else: - contents = [etree.tostring(s, encoding=enc, pretty_print=True).decode(enc, errors='ignore') for s in contentresult] - if isinstance(titleresult, str): - titles = [getSubject(titleresult)] - else: - titles = [getSubject(etree.tostring(s, method='text', encoding=enc).decode(enc, errors='ignore')) for s in titleresult] - - except IOError as e: - warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e) - - if file is not None: - file.close() - - if uri.startswith(cmdscheme) and process.wait() != 0: - warning = 'WARNING: process terminated with an error' - - if warning: - return {'content': content, 'titles': titles, 'warning': warning} - - # parse regex - if contentregex: - contents = [x for y in [re.findall(r'' + contentregex, c, re.S) for c in contents] for x in y] - if titleregex: - titles = [x for y in [re.findall(r'' + titleregex, c, re.S) for c in titles] for x in y] - - if contentregex and titleregex and len(contents) != len(titles): - warning = 'WARNING: number of title blocks (' + str(len(titles)) + ') does not match number of content blocks (' + str(len(contents)) + ') after regex' - elif contentregex and len(contents) == 0: - warning = 'WARNING: content regex became invalid!' - elif titleregex and len(titles) == 0: - warning = 'WARNING: title regex became invalid!' 
- else: - if len(contents) == 0: - contents = titles - if len(titles) == 0: - titles = [getSubject(c) for c in contents] - - return {'contents': contents, 'titles': titles, 'warning': warning} - - -# returns a short subject line -def getSubject(textContent): - if textContent == None or textContent == '': - return config.subjectPostfix - textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip() - return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent - - # generates a new RSS feed item def genFeedItem(subject, content, link, change): feeditem = etree.Element('item') @@ -213,23 +63,26 @@ def genFeedItem(subject, content, link, change): # sends mail notification -def sendmail(receiver, subject, content, sendAsHtml, link): +def sendmail(receiver, subject, content, sendAsHtml, link, encoding=None): global mailsession, defaultEncoding + if encoding is None: + encoding = defaultEncoding + if sendAsHtml: baseurl = None - if link != None: + if link is not None: content = '

' + subject + '

\n' + content baseurl = urljoin(link, '/') - mail = MIMEText('' + subject + '' + ('' if baseurl else '') + '' + content + '', 'html', defaultEncoding) + mail = MIMEText('' + subject + '' + ('' if baseurl else '') + '' + content + '', 'html', encoding) else: - if link != None: + if link is not None: content = link + '\n\n' + content - mail = MIMEText(content, 'text', defaultEncoding) + mail = MIMEText(content, 'text', encoding) mail['From'] = config.sender mail['To'] = receiver - mail['Subject'] = Header(subject, defaultEncoding) + mail['Subject'] = Header(subject, encoding) # initialize session once, not each time this method gets called if mailsession is None: @@ -244,103 +97,114 @@ def sendmail(receiver, subject, content, sendAsHtml, link): # returns a list of all content that is stored locally for a specific site -def getStoredHashes(shortname): - +def getStoredHashes(name): result = [] - filename = shortname + ".txt" + filename = os.path.join(config.workingDirectory, name + ".txt") if os.path.exists(filename): - with open(filename, 'r') as file: - for line in file: + with open(filename, 'r') as thefile: + for line in thefile: result.append(line.rstrip()) return result # updates list of content that is stored locally for a specific site -def storeHashes(shortname, contentHashes): - - with open(shortname + '.txt', 'w') as file: +def storeHashes(name, contentHashes): + with open(os.path.join(config.workingDirectory, name + '.txt'), 'w') as thefile: for h in contentHashes: - file.write(h + "\n") + thefile.write(h + "\n") + + +def runParsers(parsers, contentList=None): + if contentList is None: + contentList = [] + + for parser in parsers: + contentList = parser.performAction(contentList) + + return contentList def pollWebsites(): global defaultEncoding # parse existing feed or create a new one + rssfile = config.rssfile + if not os.path.isabs(rssfile): + rssfile = os.path.join(config.workingDirectory, rssfile) + if config.enableRSSFeed: - if 
os.path.isfile(config.rssfile): - feedXML = etree.parse(config.rssfile) + if os.path.isfile(rssfile): + feedXML = etree.parse(rssfile) else: feedXML = etree.parse(io.StringIO(emptyfeed)) # start polling sites mailsSent = 0 for site in config.sites: - print('polling site [' + site['shortname'] + '] ...') - sessionHashes = [] - parseResult = parseSite(site) + print('polling site [' + site['name'] + '] ...') receiver = site.get('receiver', config.receiver) - # if something went wrong, notify the user - if parseResult['warning']: - subject = '[' + site['shortname'] + '] WARNING' - print('WARNING: ' + parseResult['warning']) + try: + contentList = runParsers(site['parsers']) + except Exception as e: + # if something went wrong, notify the user + subject = '[' + site['name'] + '] WARNING' + print('WARNING: ' + str(e)) if config.enableMailNotifications: if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: - sendmail(receiver, subject, parseResult['warning'], False, None) + sendmail(receiver=receiver, subject=subject, content=str(e), sendAsHtml=False, link=None) mailsSent = mailsSent + 1 if config.enableRSSFeed: - feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0)) - else: - # otherwise, check which parts of the site were updated - changes = 0 - fileHashes = getStoredHashes(site['shortname']) - i = 0 - for content in parseResult['contents']: - - contenthash = hashlib.md5(content.encode(defaultEncoding)).hexdigest() - if contenthash not in fileHashes: - if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: - changes += 1 - sessionHashes.append(contenthash) - - subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i] - print(' ' + subject) - if config.enableMailNotifications and len(fileHashes) > 0: - sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri']) - mailsSent = mailsSent + 1 - - if config.enableRSSFeed: - 
feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes)) - else: + feedXML.xpath('//channel')[0].append(genFeedItem(subject, str(e), "", 0)) + continue + + sessionHashes = [] + changedContents = [] + fileHashes = getStoredHashes(site['name']) + for content in contentList: + + contenthash = hashlib.md5(content.content.encode(content.encoding)).hexdigest() + if contenthash not in fileHashes: + if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: sessionHashes.append(contenthash) + changedContents.append(content) - i += 1 + subject = '[' + site['name'] + '] ' + content.title + print(' ' + subject) + if config.enableMailNotifications and len(fileHashes) > 0: + sendAsHtml = (content.contenttype == 'html') + sendmail(receiver=receiver, subject=subject, content=content.content, sendAsHtml=sendAsHtml, link=content.uri, encoding=content.encoding) + mailsSent = mailsSent + 1 + if config.enableRSSFeed: + feedXML.xpath('//channel')[0].append(genFeedItem(subject, content.content, content.uri, len(changedContents))) + else: + sessionHashes.append(contenthash) + + if 'postRun' in site: + runParsers(site['postRun'], changedContents) - if changes > 0: - storeHashes(site['shortname'], sessionHashes) - print(' ' + str(changes) + ' updates') + if len(changedContents) > 0: + storeHashes(site['name'], sessionHashes) + print(' ' + str(len(changedContents)) + ' updates') # store feed if config.enableRSSFeed: for o in feedXML.xpath('//channel/item[position() +# Copyright: (2013-2017) Michael Till Beck # License: GPL-2.0+ + import http.server import socketserver import importlib import sys import getopt + bind = 'localhost' port = 8000 configMod = 'config' try: - opts, args = getopt.getopt(sys.argv[1:], 'hc:b:p:', ['help', 'config=', 'bind=', 'port=']) + opts, args = getopt.getopt(sys.argv[1:], 'hc:b:p:', ['help', 'config=', 'bind=', 'port=']) except getopt.GetoptError: - print('Usage: FeedServer.py --config=config --port=8000') - 
sys.exit(1) + print('Usage: FeedServer.py --config=config --port=8000 --bind=localhost') + sys.exit(1) for opt, arg in opts: - if opt == '-h': - print('Usage: FeedServer.py --config=config --bind=localhost --port=8000') - exit() - elif opt in ('-c', '--config'): - configMod = arg - elif opt in ('-b', '--bind'): - bind = arg - elif opt in ('-p', '--port'): - port = int(arg) + if opt == '-h': + print('Usage: FeedServer.py --config=config --bind=localhost --port=8000') + exit() + elif opt in ('-c', '--config'): + configMod = arg + elif opt in ('-b', '--bind'): + bind = arg + elif opt in ('-p', '--port'): + port = int(arg) config = importlib.import_module(configMod) diff --git a/mwctools.py b/mwctools.py new file mode 100755 index 0000000..cefbbf0 --- /dev/null +++ b/mwctools.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright: (2013-2017) Michael Till Beck +# License: GPL-2.0+ + + +import urllib.request +import urllib.error +import urllib.parse +import subprocess + +from lxml import etree +from cssselect import GenericTranslator +import re + + +# Attributes in HTML files storing URI values. These values are automatically translated to absolute URIs. 
+uriAttributes = [['//img[@src]', 'src'], ['//a[@href]', 'href']] + +maxTitleLength = 150 + + +class Parser: + # input: [Content], output: [Content] + def performAction(self, contentList): + pass + + +class Receiver(Parser): + def __init__(self, uri): + self.uri = uri + + +class Content: + def __init__(self, uri, encoding, title, content, contenttype): + self.uri = uri + self.encoding = encoding + self.title = title + self.content = content + self.contenttype = contenttype + + +# returns a short subject line +def getSubject(textContent): + global maxTitleLength + + if textContent is None or len(textContent.strip()) == 0: + return 'Website has been updated' + textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip() + return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent + + +# translates all relative URIs found in trees to absolute URIs +def toAbsoluteURIs(trees, baseuri): + global uriAttributes + + for tree in trees: + if isinstance(tree, str): + continue + for uriAttribute in uriAttributes: + tags = tree.xpath(uriAttribute[0]) + for tag in tags: + if tag.attrib.get(uriAttribute[1]) is not None: + if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '': + tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]]) + + +class URLReceiver(Receiver): + def __init__(self, uri, contenttype='html', encoding='utf-8', userAgent=None, accept=None): + super().__init__(uri) + self.contenttype = contenttype + self.encoding = encoding + self.userAgent = userAgent + self.accept = accept + + # input: [Content], output: [Content] + def performAction(self, contentList=None): + if contentList is None: + contentList = [] + + # open website + req = urllib.request.Request(self.uri) + if self.userAgent is not None: + req.add_header('User-Agent', self.userAgent) + if self.accept is not None: + req.add_header('Accept', self.accept) + + with urllib.request.urlopen(req) as thefile: + 
filecontent = thefile.read().decode(self.encoding, errors='ignore') + contentList.append(Content(uri=self.uri, encoding=self.encoding, title=None, content=filecontent, contenttype=self.contenttype)) + + return contentList + + +class CommandReceiver(Receiver): + def __init__(self, command, contenttype='text', encoding='utf-8'): + super().__init__(command) + self.encoding = encoding + self.command = command + self.contenttype = contenttype + + # input: [Content], output: [Content] + def performAction(self, contentList=None): + if contentList is None: + contentList = [] + + # run command and retrieve output + process = subprocess.Popen(self.command, stdout=subprocess.PIPE, shell=True, close_fds=True) + thefile = process.stdout + result = thefile.read().decode(self.encoding, errors='ignore') + thefile.close() + + if process.wait() != 0: + raise Exception("process terminated with an error") + + contentList.append(Content(uri=None, encoding=self.encoding, title=None, content=result, contenttype=self.contenttype)) + return contentList + + +class XPathParser(Parser): + def __init__(self, contentxpath, titlexpath=None): + self.contentxpath = contentxpath + self.titlexpath = titlexpath + + # input: [Content], output: [Content] + def performAction(self, contentList): + result = [] + for content in contentList: + result.extend(self.parseOneObject(content)) + return result + + # input: Content, output: [Content] + def parseOneObject(self, content): + baseuri = content.uri + if content.contenttype == 'html': + parser = etree.HTMLParser(encoding=content.encoding) + else: + parser = etree.XMLParser(recover=True, encoding=content.encoding) + + tree = etree.fromstring(content.content, parser=parser) + + # xpath + contentresult = [] if self.contentxpath is None else tree.xpath(self.contentxpath) + titleresult = [] if self.titlexpath is None else tree.xpath(self.titlexpath) + + # translate relative URIs to absolute URIs + if content.contenttype == 'html': + basetaglist = 
tree.xpath('/html/head/base') + if len(basetaglist) != 0: + baseuri = basetaglist[0].attrib['href'] + if len(contentresult) != 0: + toAbsoluteURIs(contentresult, baseuri) + if len(titleresult) != 0: + toAbsoluteURIs(titleresult, baseuri) + + if self.contentxpath and len(contentresult) == 0: + raise Exception('WARNING: content selector became invalid!') + if self.titlexpath and len(titleresult) == 0: + raise Exception('WARNING: title selector became invalid!') + + contents = [] + titles = [] + if isinstance(contentresult, str): + contents = [contentresult] + else: + if len(contentresult) == 0: + contentresult = titleresult + contents = [etree.tostring(s, encoding=content.encoding, pretty_print=True).decode(content.encoding, errors='ignore') for s in contentresult] + + if isinstance(titleresult, str): + titles = [getSubject(titleresult)]*len(contents) + else: + if len(titleresult) == 0 or len(titleresult) != len(contentresult): + titleresult = contentresult + titles = [getSubject(etree.tostring(s, method='text', encoding=content.encoding).decode(content.encoding, errors='ignore')) for s in titleresult] + + result = [] + for i in range(0, len(contents)): + result.append(Content(uri=content.uri, encoding=content.encoding, title=titles[i], content=contents[i], contenttype=content.contenttype)) + + return result + + +class CSSParser(Parser): + def __init__(self, contentcss, titlecss=None): + contentxpath = GenericTranslator().css_to_xpath(contentcss) + titlexpath = None + if titlecss is not None: + titlexpath = GenericTranslator().css_to_xpath(titlecss) + + self.xpathparser = XPathParser(contentxpath=contentxpath, titlexpath=titlexpath) + + # input: [Content], output: [Content] + def performAction(self, contentList): + return self.xpathparser.performAction(contentList) + + +class RegExParser(Parser): + def __init__(self, contentregex, titleregex=None): + self.contentregex = contentregex + self.titleregex = titleregex + + # input: [Content], output: [Content] + def 
performAction(self, contentList): + result = [] + for content in contentList: + result.extend(self.parseOneObject(content)) + return result + + # input: Content, output: [Content] + def parseOneObject(self, content): + contents = [] + titles = [] + if self.contentregex is not None: + for c in re.findall(r'' + self.contentregex, content.content, re.M): + if len(c.strip()) != 0: + contents.append(c) + if self.titleregex is not None: + for c in re.findall(r'' + self.titleregex, content.title, re.M): + if len(c.strip()) != 0: + titles.append(c) + + if self.contentregex is not None and len(contents) == 0: + raise Exception('WARNING: content regex became invalid!') + elif self.titleregex is not None and len(titles) == 0: + raise Exception('WARNING: title regex became invalid!') + else: + if len(contents) == 0: + contents = titles + if len(titles) == 0 or len(titles) != len(contents): + titles = [getSubject(c) for c in contents] + + result = [] + for i in range(0, len(contents)): + result.append(Content(uri=content.uri, encoding=content.encoding, title=titles[i], content=contents[i], contenttype=content.contenttype)) + + return result + -- cgit v1.2.3 From 85ce8edb9d5b92df39de1a22002bf29d779eeac2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 1 Oct 2017 19:25:58 +0200 Subject: Remove patches --- debian/changelog | 7 ++----- debian/patches/0100-config.diff | 24 ------------------------ debian/patches/series | 6 +++--- 3 files changed, 5 insertions(+), 32 deletions(-) diff --git a/debian/changelog b/debian/changelog index 63449fb..8335f2d 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,10 +1,7 @@ -mwc (1.8.2-1) UNRELEASED; urgency=medium +mwc (2.0.4-1) UNRELEASED; urgency=medium * New upstream release (Closes: #862004). - + Refresh patches. - * Renumbering patches. - * debian/patches/0105-try_mail.diff: - - Replace undefined printf with print (Closes: #860494). + + Remove now unuseable patches. 
* Rewrite debian/watch for archives without "v" in front of the version. * Declare compliance with Debian Policy 4.0.0. (No changes needed). * Bump compatlevel to 10 (no changes required): diff --git a/debian/patches/0100-config.diff b/debian/patches/0100-config.diff index 8529874..49bb589 100644 --- a/debian/patches/0100-config.diff +++ b/debian/patches/0100-config.diff @@ -5,30 +5,6 @@ Author: Jörg Frings-Fürst Last-Update: 2014-05-12 --- This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ -Index: trunk/config_template.py -=================================================================== ---- trunk.orig/config_template.py -+++ trunk/config_template.py -@@ -1,5 +1,3 @@ --import os.path -- - # Copyright: (2013-2014) Michael Till Beck - # License: GPL-2.0+ - -@@ -47,11 +45,11 @@ sender = 'me@mymail.com' - smtphost = 'mysmtpprovider.com' - useTLS = True - smtpport = 587 --smtpusername = sender -+smtpusername = 'sender' - smtppwd = 'mypassword' - receiver = 'me2@mymail.com' - --os.chdir('/var/cache/mwc') -+datadir'/var/cache/mwc' - - enableRSSFeed = True - rssfile = 'feed.xml' Index: trunk/mwcfeedserver.py =================================================================== --- trunk.orig/mwcfeedserver.py diff --git a/debian/patches/series b/debian/patches/series index a06ba62..3ad4804 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -1,4 +1,4 @@ -0100-config.diff -0105-try_mail.diff -0110-syslog.diff +#0100-config.diff +#0105-try_mail.diff +#0110-syslog.diff #0115-Add_header.patch -- cgit v1.2.3 From 9a99fe51fb073a85169d4e50ca68cb0ec4531c8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 1 Oct 2017 19:26:45 +0200 Subject: Declare compliance with Debian Policy 4.1.1 --- debian/changelog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian/changelog b/debian/changelog index 8335f2d..13fc5f0 100644 --- a/debian/changelog +++ b/debian/changelog @@ -3,7 +3,7 @@ mwc (2.0.4-1) 
UNRELEASED; urgency=medium * New upstream release (Closes: #862004). + Remove now unuseable patches. * Rewrite debian/watch for archives without "v" in front of the version. - * Declare compliance with Debian Policy 4.0.0. (No changes needed). + * Declare compliance with Debian Policy 4.1.1. (No changes needed). * Bump compatlevel to 10 (no changes required): - Change debian/compat to 10. - At debian/control change requested version of debhelper to >= 10. -- cgit v1.2.3 From 9a955c414e34de441b5f188520314d54e3c5b3c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 1 Oct 2017 20:08:51 +0200 Subject: remove patches; rewrite d/changelog --- .gitignore | 2 +- debian/changelog | 6 ++- debian/control | 2 +- debian/patches/0100-config.diff | 88 ------------------------------------ debian/patches/0105-try_mail.diff | 52 --------------------- debian/patches/0110-syslog.diff | 64 -------------------------- debian/patches/0115-Add_header.patch | 50 -------------------- debian/patches/series | 4 -- 8 files changed, 6 insertions(+), 262 deletions(-) delete mode 100644 debian/patches/0100-config.diff delete mode 100644 debian/patches/0105-try_mail.diff delete mode 100644 debian/patches/0110-syslog.diff delete mode 100644 debian/patches/0115-Add_header.patch diff --git a/.gitignore b/.gitignore index 4ccd411..91ef9c0 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,4 @@ /*.txt *~ *.pyc - +.pc diff --git a/debian/changelog b/debian/changelog index 13fc5f0..ad84848 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,7 +1,9 @@ mwc (2.0.4-1) UNRELEASED; urgency=medium - * New upstream release (Closes: #862004). - + Remove now unuseable patches. + * New upstream release: + - Remove now unusable patches and therefore the printf function + (Closes: #860494). + - Add more Parameter needed at some websites (Closes: #862004). * Rewrite debian/watch for archives without "v" in front of the version. * Declare compliance with Debian Policy 4.1.1. 
(No changes needed). * Bump compatlevel to 10 (no changes required): diff --git a/debian/control b/debian/control index f38df7d..792ce2a 100644 --- a/debian/control +++ b/debian/control @@ -6,7 +6,7 @@ Build-Depends: debhelper (>= 10), dh-python, python3-all -Standards-Version: 4.0.0 +Standards-Version: 4.1.1 Homepage: https://github.com/Debianguru/MailWebsiteChanges Vcs-Git: git://anonscm.debian.org/collab-maint/mwc.git Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/mwc.git diff --git a/debian/patches/0100-config.diff b/debian/patches/0100-config.diff deleted file mode 100644 index 49bb589..0000000 --- a/debian/patches/0100-config.diff +++ /dev/null @@ -1,88 +0,0 @@ -Description: add loading config from every path - Add loading config from ervery path - Separation data / program -Author: Jörg Frings-Fürst -Last-Update: 2014-05-12 ---- -This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ -Index: trunk/mwcfeedserver.py -=================================================================== ---- trunk.orig/mwcfeedserver.py -+++ trunk/mwcfeedserver.py -@@ -6,6 +6,7 @@ - import http.server - import socketserver - import importlib -+import os - import sys - import getopt - -@@ -31,13 +32,26 @@ for opt, arg in opts: - elif opt in ('-p', '--port'): - port = int(arg) - --config = importlib.import_module(configMod) -- -+# -+# add code to load config from nonsystem path -+# and change to datadir -+# -+try: -+ path = os.path.dirname(configMod) -+ fullname = os.path.basename(configMod) -+ sys.path.append(path) -+ config = importlib.import_module(fullname) -+except: -+ print('Error: loading config') -+ sys.exit(2) - - handler = http.server.SimpleHTTPRequestHandler - - httpd = socketserver.TCPServer((bind, port), handler) - - print('Bond to ' + bind + ', listening on port ' + str(port)) --httpd.serve_forever() -- -+try: -+ httpd.serve_forever() -+except KeyboardInterrupt: -+ pass -+httpd.server_close() -Index: trunk/mwc.py 
-=================================================================== ---- trunk.orig/mwc.py -+++ trunk/mwc.py -@@ -334,7 +334,7 @@ def pollWebsites(): - - if __name__ == "__main__": - -- configMod = 'config' -+ configMod = '/etc/mwc/mwc-config' - dryrun = None - - try: -@@ -351,7 +351,23 @@ if __name__ == "__main__": - elif opt in ('-d', '--dry-run'): - dryrun = arg - -- config = importlib.import_module(configMod) -+ # -+ # add code to load config from nonsystem path -+ # and change to datadir -+ # -+ try: -+ path = os.path.dirname(configMod) -+ fullname = os.path.basename(configMod) -+ sys.path.append(path) -+ config = importlib.import_module(fullname) -+ except: -+ print('Error: loading config') -+ sys.exit(2) -+ try: -+ os.chdir(config.datadir) -+ except: -+ print('Error: datadir not found') -+ sys.exit(3) - - if dryrun: - for site in config.sites: diff --git a/debian/patches/0105-try_mail.diff b/debian/patches/0105-try_mail.diff deleted file mode 100644 index bc62ef1..0000000 --- a/debian/patches/0105-try_mail.diff +++ /dev/null @@ -1,52 +0,0 @@ -Description: try / except around mail functions - add try / except around mail functions to - prevent python errors messages -Author: Jörg Frings-Fürst -Forwarded: via mail -Applied-Upstream: -Reviewed-by: -Last-Update: 2014-05-22 ---- -This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ -Index: trunk/mwc.py -=================================================================== ---- trunk.orig/mwc.py -+++ trunk/mwc.py -@@ -232,16 +232,28 @@ def sendmail(receiver, subject, content, - mail['Subject'] = Header(subject, defaultEncoding) - - # initialize session once, not each time this method gets called -- if mailsession is None: -- mailsession = smtplib.SMTP(config.smtphost, config.smtpport) -- if config.useTLS: -- mailsession.ehlo() -- mailsession.starttls() -- if config.smtpusername is not None: -- mailsession.login(config.smtpusername, config.smtppwd) -- -- mailsession.sendmail(config.sender, 
receiver.split(','), mail.as_string()) - -+ # -+ # add try / except to open mailsession -+ # -+ try: -+ if mailsession is None: -+ mailsession = smtplib.SMTP(config.smtphost, config.smtpport) -+ if config.useTLS: -+ mailsession.ehlo() -+ mailsession.starttls() -+ mailsession.login(config.smtpusername, config.smtppwd) -+ # -+ # add try / except to send mail -+ # -+ except: -+ print('Error: Open smtp-session') -+ exit(4) -+ try: -+ mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) -+ except: -+ print('Error: sendmail') -+ exit(5) - - # returns a list of all content that is stored locally for a specific site - def getStoredHashes(shortname): diff --git a/debian/patches/0110-syslog.diff b/debian/patches/0110-syslog.diff deleted file mode 100644 index bd61d81..0000000 --- a/debian/patches/0110-syslog.diff +++ /dev/null @@ -1,64 +0,0 @@ -Description: add syslog messages on errors -Author: Jörg Frings-Fürst -Forwarded: via mail -Last-Update: 2014-05-22 ---- -This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ -Index: trunk/mwc.py -=================================================================== ---- trunk.orig/mwc.py -+++ trunk/mwc.py -@@ -21,6 +21,7 @@ import os - import sys - import getopt - import traceback -+import syslog - - import subprocess - -@@ -248,11 +249,13 @@ def sendmail(receiver, subject, content, - # - except: - print('Error: Open smtp-session') -+ syslog.syslog(syslog.LOG_ERR, 'can not open smtp session') - exit(4) - try: - mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) - except: - print('Error: sendmail') -+ syslog.syslog(syslog.LOG_ERR, 'error on sendmail') - exit(5) - - # returns a list of all content that is stored locally for a specific site -@@ -349,6 +352,11 @@ if __name__ == "__main__": - configMod = '/etc/mwc/mwc-config' - dryrun = None - -+ # -+ # add syslog open -+ # -+ syslog.openlog() -+ - try: - opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 
'dry-run=']) - except getopt.GetoptError: -@@ -374,11 +382,13 @@ if __name__ == "__main__": - config = importlib.import_module(fullname) - except: - print('Error: loading config') -+ syslog.syslog(syslog.LOG_ERR, 'can not found / load mwc-config') - sys.exit(2) - try: - os.chdir(config.datadir) - except: - print('Error: datadir not found') -+ syslog.syslog(syslog.LOG_ERR, 'datadir not found') - sys.exit(3) - - if dryrun: -@@ -400,3 +410,5 @@ if __name__ == "__main__": - if mailsession: - mailsession.quit() - mailsession = None -+ -+ syslog.closelog() diff --git a/debian/patches/0115-Add_header.patch b/debian/patches/0115-Add_header.patch deleted file mode 100644 index 6ce0c15..0000000 --- a/debian/patches/0115-Add_header.patch +++ /dev/null @@ -1,50 +0,0 @@ -Description: Add Header Accept -Author: Jörg Frings-Fürst -Bug-Debian: http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=862004 -Forwarded: https://github.com/Debianguru/MailWebsiteChanges/issues/11 -Last-Update: 2017-05-07 ---- -This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ -Index: trunk/mwc.py -=================================================================== ---- trunk.orig/mwc.py -+++ trunk/mwc.py -@@ -91,6 +91,8 @@ def parseSite(site): - req = urllib.request.Request(uri) - if 'user-agent' in site: - req.add_header('User-Agent', site['user-agent']) -+ if 'accept' in site: -+ req.add_header('Accept', site['accept']) - file = urllib.request.urlopen(req) - - -Index: trunk/README.md -=================================================================== ---- trunk.orig/README.md -+++ trunk/README.md -@@ -59,7 +59,9 @@ sites = [ - * user-agent (optional) - Defines the user agent string, e.g., - 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0' -- -+ * accept (optional) -+ Defines the accept string, e.g., -+ 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' - - * We collect some XPath/CSS snippets at this place: Snippet 
collection - please feel free to add your own definitions! - -Index: trunk/config_template.py -=================================================================== ---- trunk.orig/config_template.py -+++ trunk/config_template.py -@@ -12,7 +12,9 @@ sites = [ - 'contentxpath': '//div', - 'titleregex': '', - 'contentregex': '', -- 'encoding': 'utf-8'}, -+ 'encoding': 'utf-8', -+ 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0', -+ 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}, - - {'shortname': 'mywebsite2', - 'uri': 'http://www.mywebsite2.com/info', diff --git a/debian/patches/series b/debian/patches/series index 3ad4804..e69de29 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -1,4 +0,0 @@ -#0100-config.diff -#0105-try_mail.diff -#0110-syslog.diff -#0115-Add_header.patch -- cgit v1.2.3 From 613720c29e3613baf3eb56511eab13195f4e8790 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 1 Oct 2017 20:24:36 +0200 Subject: d/changelog: change release, date/time to upload --- debian/changelog | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debian/changelog b/debian/changelog index ad84848..716efc8 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -mwc (2.0.4-1) UNRELEASED; urgency=medium +mwc (2.0.4-1) unstable; urgency=medium * New upstream release: - Remove now unusable patches and therefore the printf function @@ -14,7 +14,7 @@ mwc (2.0.4-1) UNRELEASED; urgency=medium - Refresh copyright year at * and debian/*. * New README.source to explain the branching model used. - -- Jörg Frings-Fürst Sun, 06 Aug 2017 19:52:54 +0200 + -- Jörg Frings-Fürst Sun, 01 Oct 2017 20:21:11 +0200 mwc (1.7.2-3) unstable; urgency=medium -- cgit v1.2.3