summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJörg Frings-Fürst <debian@jff-webhosting.net>2017-10-02 06:09:15 +0200
committerJörg Frings-Fürst <debian@jff-webhosting.net>2017-10-02 06:09:15 +0200
commitef84a92a429b724586031878dfc0b38c64d55a23 (patch)
tree4fb270e0f1bdd261417fdc06cfb6bf3a58d17023
parent18449d7dd31123e14f8a0d87047f2f85187a156b (diff)
parent613720c29e3613baf3eb56511eab13195f4e8790 (diff)
Merge branch 'release/2.0.4-1'2.0.4-1
-rw-r--r--.bzrignore3
-rw-r--r--.gitignore2
-rw-r--r--README.md88
-rwxr-xr-x[-rw-r--r--]config_template.py75
-rw-r--r--debian/README.source18
-rw-r--r--debian/changelog15
-rw-r--r--debian/control2
-rw-r--r--debian/patches/0100-config.diff115
-rw-r--r--debian/patches/0105-try_mail.diff52
-rw-r--r--debian/patches/0110-syslog.diff96
-rw-r--r--debian/patches/0115-Add_header.patch50
-rw-r--r--debian/patches/series4
-rwxr-xr-xmwc.py483
-rwxr-xr-xmwcfeedserver.py31
-rwxr-xr-xmwctools.py239
15 files changed, 562 insertions, 711 deletions
diff --git a/.bzrignore b/.bzrignore
deleted file mode 100644
index 2386f62..0000000
--- a/.bzrignore
+++ /dev/null
@@ -1,3 +0,0 @@
-.git
-**/.git
-**/.pc
diff --git a/.gitignore b/.gitignore
index 1984fa7..91ef9c0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,4 @@
/*.txt
*~
*.pyc
-.bzrignore
-.bzr
.pc
diff --git a/README.md b/README.md
index d008527..69718e7 100644
--- a/README.md
+++ b/README.md
@@ -19,20 +19,23 @@ Some examples:
<code>
sites = [
- {'shortname': 'mywebsite1',
- 'uri': 'http://www.mywebsite1.com/info',
- 'contentcss': 'div'},
-
- {'shortname': 'mywebsite2',
- 'uri': 'http://www.mywebsite2.com/info',
- 'contentxpath': '//*[contains(concat(\' \', normalize-space(@class), \' \'), \' news-list-container \')]',
- 'titlexpath': '//title'},
-
- {'shortname': 'mywebsite3',
- 'uri': 'http://www.mywebsite3.com/info',
- 'type': 'text',
- 'contentregex': 'Version\"\:\d*\.\d*',
- 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'}
+ {'name': 'example-css',
+ 'parsers': [uri(uri='https://github.com/mtill', contenttype='html'),
+ css(contentcss='div')
+ ]
+ },
+
+ {'name': 'example-xpath',
+ 'parsers': [uri(uri='https://example-webpage.com/test', contenttype='html'),
+ xpath(contentxpath='//div[contains(concat(\' \', normalize-space(@class), \' \'), \' package-version-header \')]')
+ ]
+ },
+
+ {'name': 'my-script',
+ 'parsers': [command(command='/home/user/script.sh', contenttype='text'),
+ regex(contentregex='^.*$')
+ ]
+ }
]
</code>
@@ -40,29 +43,55 @@ sites = [
* parameters:
- * <b>shortname</b>
- short name of the entry, used as an identifier when sending email notifications
+ * <b>name</b>
+ name of the entry, used as an identifier when sending email notifications
+ * <b>receiver</b> (optional)
+ Overrides global receiver specification.
+
+ * parameters for the URL receiver:
+
* <b>uri</b>
- URI of the website; If the scheme of the uri is 'cmd://', the string is interpreted as a command and the standard output (stdout) is parsed.
- * <b>type</b> (optional; default: 'html')
+ URI of the website
+ * <b>contenttype</b> (optional; default: 'html')
content type, e.g., 'xml'/'html'/'text'.
- * <b>contentxpath</b> / <b>titlexpath</b> (optional)
- XPath expression for the content/title sections to extract. If you prefer, you could use contentcss/titlecss instead.
- * <b>contentcss</b> / <b>titlecss</b> (optional)
- CSS expression for the content/title sections to extract. This is ignored if there is a corresponding XPath definition.
- * <b>contentregex</b> / <b>titleregex</b> (optional)
- Regular expression. If XPath/CSS selector is defined, the regular expression is applied afterwards.
- * <b>encoding</b> (optional; default: 'utf-8')
+ * <b>enc</b> (optional; default: 'utf-8')
Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'.
- * <b>receiver</b> (optional)
- Overrides global receiver specification.
- * <b>user-agent</b> (optional)
+ * <b>userAgent</b> (optional)
Defines the user agent string, e.g.,
- 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'
+ 'userAgent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'
* <b>accept</b> (optional)
Defines the accept string, e.g.,
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+ * parameters for the Command receiver
+
+ * <b>command</b>
+ the command
+ * <b>contenttype</b> (optional; default: 'text')
+ content type, e.g., 'xml'/'html'/'text'.
+ * <b>enc</b> (optional; default: 'utf-8')
+ Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'.
+
+ * parameters for the XPath parser:
+
+ * <b>contentxpath</b>
+ XPath expression for the content sections to extract
+ * <b>titlexpath</b> (optional)
+ XPath expression for the title sections to extract
+
+ * parameters for the CSS parser:
+
+ * <b>contentcss</b>
+ CSS expression for the content sections to extract
+ * <b>titlecss</b> (optional)
+ CSS expression for the title sections to extract
+
+ * parameters for the RegEx parser:
+
+ * <b>contentregex</b>
+ Regular expression for content parsing
+ * <b>titleregex</b> (optional)
+ Regular expression for title parsing
* We collect some XPath/CSS snippets at this place: <a href="https://github.com/Debianguru/MailWebsiteChanges/wiki/snippets">Snippet collection</a> - please feel free to add your own definitions!
@@ -79,6 +108,7 @@ sites = [
<pre>
<code>
enableMailNotifications = True #enable/disable notification messages; if set to False, only send error messages
+maxMailsPerSession = -1 #max. number of mails to send per session; ignored when set to -1
subjectPostfix = 'A website has been updated!'
sender = 'me@mymail.com'
diff --git a/config_template.py b/config_template.py
index 02f7579..02788bd 100644..100755
--- a/config_template.py
+++ b/config_template.py
@@ -1,47 +1,50 @@
-import os.path
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
-# Copyright: (2013-2014) Michael Till Beck <Debianguru@gmx.de>
+# Copyright: (2013-2017) Michael Till Beck <Debianguru@gmx.de>
# License: GPL-2.0+
-#We collect xpath snippets at this place: <a href="https://github.com/Debianguru/MailWebsiteChanges/wiki/snippets">Snippet collection</a> - please feel free to add your own definitions!
+
+# We collect xpath snippets at this place:
+# <a href="https://github.com/Debianguru/MailWebsiteChanges/wiki/snippets">Snippet collection</a>
+# Feel free to contribute!
+
+
+from mwctools import URLReceiver as uri
+from mwctools import CommandReceiver as command
+from mwctools import XPathParser as xpath
+from mwctools import CSSParser as css
+from mwctools import RegExParser as regex
+from mwctools import Content
+from mwctools import Parser
+
sites = [
- {'shortname': 'mywebsite1',
- 'uri': 'http://www.mywebsite1.com/info',
- 'type': 'html',
- 'titlexpath': '//h1',
- 'contentxpath': '//div',
- 'titleregex': '',
- 'contentregex': '',
- 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0',
- 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
- 'encoding': 'utf-8'},
-
- {'shortname': 'mywebsite2',
- 'uri': 'http://www.mywebsite2.com/info',
- 'type': 'html',
- 'contentxpath': '//*[contains(concat(\' \', normalize-space(@class), \' \'), \' news-list-container \')]',
- 'regex': '',
- 'encoding': 'utf-8'},
-
- {'shortname': 'mywebsite3',
- 'uri': 'http://www.mywebsite3.com/info',
- 'type': 'text',
- 'contentxpath': '',
- 'contentregex': 'Version\"\:\d*\.\d*',
- 'encoding': 'utf-8'},
-
- {'shortname': 'lscmd',
- 'uri': 'cmd://ls -l /home/pi',
- 'contentregex': '.*Desktop.*'
- }
+ {'name': 'example-css',
+ 'parsers': [uri(uri='https://github.com/mtill', contenttype='html'),
+ css(contentcss='div')
+ ]
+ },
+
+ {'name': 'example-xpath',
+ 'parsers': [uri(uri='https://example-webpage.com/test', contenttype='html'),
+ xpath(contentxpath='//div[contains(concat(\' \', normalize-space(@class), \' \'), \' package-version-header \')]')
+ ]
+ },
+
+ {'name': 'my-script',
+ 'parsers': [command(command='/home/user/script.sh', contenttype='text'),
+ regex(contentregex='^.*$')
+ ]
+ }
]
-subjectPostfix = 'A website has been updated!'
+workingDirectory = '/path-to-data-dir/MailWebsiteChanges-data'
-enableMailNotifications = True
+enableMailNotifications = False
+maxMailsPerSession = -1
sender = 'me@mymail.com'
smtphost = 'mysmtpprovider.com'
useTLS = True
@@ -50,9 +53,7 @@ smtpusername = sender
smtppwd = 'mypassword'
receiver = 'me2@mymail.com'
-os.chdir('/var/cache/mwc')
-
-enableRSSFeed = True
+enableRSSFeed = False
rssfile = 'feed.xml'
maxFeeds = 100
diff --git a/debian/README.source b/debian/README.source
new file mode 100644
index 0000000..e4f2b3d
--- /dev/null
+++ b/debian/README.source
@@ -0,0 +1,18 @@
+Hello,
+
+now I use the branching model from Vincent Driessen[1].
+
+I use the gitflow-avh[2]. with the Documentation[3].
+The Debian package can be found here[4].
+
+Please upload unattended uploads use a branch feature/<your title>.
+
+
+Many thanks.
+
+ -- Jörg Frings-Fürst <debian@jff-webhosting.net> Fri, 02 Jun 2017 19:00:40 +0200
+
+[1] http://nvie.com/posts/a-successful-git-branching-model/
+[2] https://github.com/petervanderdoes/gitflow-avh
+[3] https://github.com/petervanderdoes/gitflow-avh/wiki
+[4] https://tracker.debian.org/pkg/git-flow
diff --git a/debian/changelog b/debian/changelog
index 48ee0e8..716efc8 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,19 +1,20 @@
-mwc (1.7.5-1) UNRELEASED; urgency=medium
+mwc (2.0.4-1) unstable; urgency=medium
- * New upstream release.
- * Renumbering patches.
- * debian/patches/0105-try_mail.diff:
- - Replace undefined printf with print (Closes: #860494).
+ * New upstream release:
+ - Remove now unusable patches and therefore the printf function
+ (Closes: #860494).
+ - Add more Parameter needed at some websites (Closes: #862004).
* Rewrite debian/watch for archives without "v" in front of the version.
- * Bump Standards-Version to 3.9.8.
+ * Declare compliance with Debian Policy 4.1.1. (No changes needed).
* Bump compatlevel to 10 (no changes required):
- Change debian/compat to 10.
- At debian/control change requested version of debhelper to >= 10.
* At debian/control change Vcs-Browser to secure URI.
* debian/copyright:
- Refresh copyright year at * and debian/*.
+ * New README.source to explain the branching model used.
- -- Jörg Frings-Fürst <debian@jff-webhosting.net> Tue, 18 Apr 2017 11:06:04 +0200
+ -- Jörg Frings-Fürst <debian@jff-webhosting.net> Sun, 01 Oct 2017 20:21:11 +0200
mwc (1.7.2-3) unstable; urgency=medium
diff --git a/debian/control b/debian/control
index 70dd2d3..792ce2a 100644
--- a/debian/control
+++ b/debian/control
@@ -6,7 +6,7 @@ Build-Depends:
debhelper (>= 10),
dh-python,
python3-all
-Standards-Version: 3.9.8
+Standards-Version: 4.1.1
Homepage: https://github.com/Debianguru/MailWebsiteChanges
Vcs-Git: git://anonscm.debian.org/collab-maint/mwc.git
Vcs-Browser: https://anonscm.debian.org/cgit/collab-maint/mwc.git
diff --git a/debian/patches/0100-config.diff b/debian/patches/0100-config.diff
deleted file mode 100644
index ce4dba7..0000000
--- a/debian/patches/0100-config.diff
+++ /dev/null
@@ -1,115 +0,0 @@
-Description: add loading config from every path
- Add loading config from ervery path
- Separation data / program
-Author: Jörg Frings-Fürst <jff@jff-webhosting.net>
-Last-Update: 2014-05-12
----
-This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
-Index: trunk/mwc.py
-===================================================================
---- trunk.orig/mwc.py
-+++ trunk/mwc.py
-@@ -319,7 +319,7 @@ def pollWebsites():
-
- if __name__ == "__main__":
-
-- configMod = 'config'
-+ configMod = '/etc/mwc/mwc-config'
- dryrun = None
-
- try:
-@@ -335,9 +335,24 @@ if __name__ == "__main__":
- configMod = arg
- elif opt in ('-d', '--dry-run'):
- dryrun = arg
--
-- config = importlib.import_module(configMod)
--
-+ #
-+ # add code to load config from nonsystem path
-+ # and change to datadir
-+ #
-+ try:
-+ path = os.path.dirname(configMod)
-+ fullname = os.path.basename(configMod)
-+ sys.path.append(path)
-+ config = importlib.import_module(fullname)
-+ except:
-+ print('Error: loading config')
-+ sys.exit(2)
-+ try:
-+ os.chdir(config.datadir)
-+ except:
-+ print('Error: datadir not found')
-+ sys.exit(3)
-+
- if dryrun:
- for site in config.sites:
- if site['shortname'] == dryrun:
-Index: trunk/config_template.py
-===================================================================
---- trunk.orig/config_template.py
-+++ trunk/config_template.py
-@@ -1,5 +1,3 @@
--import os.path
--
- # Copyright: (2013-2014) Michael Till Beck <Debianguru@gmx.de>
- # License: GPL-2.0+
-
-@@ -46,11 +44,11 @@ sender = 'me@mymail.com'
- smtphost = 'mysmtpprovider.com'
- useTLS = True
- smtpport = 587
--smtpusername = sender
-+smtpusername = 'sender'
- smtppwd = 'mypassword'
- receiver = 'me2@mymail.com'
-
--os.chdir('/var/cache/mwc')
-+datadir'/var/cache/mwc'
-
- enableRSSFeed = True
- rssfile = 'feed.xml'
-Index: trunk/mwcfeedserver.py
-===================================================================
---- trunk.orig/mwcfeedserver.py
-+++ trunk/mwcfeedserver.py
-@@ -6,6 +6,7 @@
- import http.server
- import socketserver
- import importlib
-+import os
- import sys
- import getopt
-
-@@ -31,13 +32,26 @@ for opt, arg in opts:
- elif opt in ('-p', '--port'):
- port = int(arg)
-
--config = importlib.import_module(configMod)
--
-+#
-+# add code to load config from nonsystem path
-+# and change to datadir
-+#
-+try:
-+ path = os.path.dirname(configMod)
-+ fullname = os.path.basename(configMod)
-+ sys.path.append(path)
-+ config = importlib.import_module(fullname)
-+except:
-+ print('Error: loading config')
-+ sys.exit(2)
-
- handler = http.server.SimpleHTTPRequestHandler
-
- httpd = socketserver.TCPServer((bind, port), handler)
-
- print('Bond to ' + bind + ', listening on port ' + str(port))
--httpd.serve_forever()
--
-+try:
-+ httpd.serve_forever()
-+except KeyboardInterrupt:
-+ pass
-+httpd.server_close()
diff --git a/debian/patches/0105-try_mail.diff b/debian/patches/0105-try_mail.diff
deleted file mode 100644
index d390b6d..0000000
--- a/debian/patches/0105-try_mail.diff
+++ /dev/null
@@ -1,52 +0,0 @@
-Description: try / except around mail functions
- add try / except around mail functions to
- prevent python errors messages
-Author: Jörg Frings-Fürst <debian@jff-webhosting.net>
-Forwarded: via mail
-Applied-Upstream: <version|URL|commit, identifies patches merged upstream, optional>
-Reviewed-by:
-Last-Update: 2014-05-22
----
-This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
-Index: trunk/mwc.py
-===================================================================
---- trunk.orig/mwc.py
-+++ trunk/mwc.py
-@@ -225,16 +225,27 @@ def sendmail(receiver, subject, content,
- mail['Subject'] = Header(subject, defaultEncoding)
-
- # initialize session once, not each time this method gets called
-- if mailsession is None:
-- mailsession = smtplib.SMTP(config.smtphost, config.smtpport)
-- if config.useTLS:
-- mailsession.ehlo()
-- mailsession.starttls()
-- if config.smtpusername is not None:
-- mailsession.login(config.smtpusername, config.smtppwd)
--
-- mailsession.sendmail(config.sender, receiver.split(','), mail.as_string())
--
-+ #
-+ # add try / except to open mailsession
-+ #
-+ try:
-+ if mailsession is None:
-+ mailsession = smtplib.SMTP(config.smtphost, config.smtpport)
-+ if config.useTLS:
-+ mailsession.ehlo()
-+ mailsession.starttls()
-+ mailsession.login(config.smtpusername, config.smtppwd)
-+ #
-+ # add try / except to send mail
-+ #
-+ except:
-+ print('Error: Open smtp-session')
-+ exit(4)
-+ try:
-+ mailsession.sendmail(config.sender, receiver.split(','), mail.as_string())
-+ except:
-+ print('Error: sendmail')
-+ exit(5)
-
- # returns a list of all content that is stored locally for a specific site
- def getFileContents(shortname):
diff --git a/debian/patches/0110-syslog.diff b/debian/patches/0110-syslog.diff
deleted file mode 100644
index 12d629d..0000000
--- a/debian/patches/0110-syslog.diff
+++ /dev/null
@@ -1,96 +0,0 @@
-Description: add syslog messages on errors
-Author: Jörg Frings-Fürst <debian@jffwebhosting.net>
-Forwarded: via mail
-Last-Update: 2014-05-22
----
-This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
-Index: trunk/mwc.py
-===================================================================
---- trunk.orig/mwc.py
-+++ trunk/mwc.py
-@@ -19,6 +19,7 @@ import os
- import sys
- import getopt
- import traceback
-+import syslog
-
- import subprocess
-
-@@ -227,25 +228,28 @@ def sendmail(receiver, subject, content,
- # initialize session once, not each time this method gets called
- #
- # add try / except to open mailsession
-- #
-+ #
-+
- try:
-- if mailsession is None:
-- mailsession = smtplib.SMTP(config.smtphost, config.smtpport)
-- if config.useTLS:
-- mailsession.ehlo()
-- mailsession.starttls()
-- mailsession.login(config.smtpusername, config.smtppwd)
-- #
-+ if mailsession is None:
-+ mailsession = smtplib.SMTP(config.smtphost, config.smtpport)
-+ if config.useTLS:
-+ mailsession.ehlo()
-+ mailsession.starttls()
-+ mailsession.login(config.smtpusername, config.smtppwd)
-+ except:
-+ print('Error: Open smtp-session')
-+ syslog.syslog(syslog.LOG_ERR, 'can not open smtp session')
-+ exit(4)
-+ #
- # add try / except to send mail
- #
-+ try:
-+ mailsession.sendmail(config.sender, receiver.split(','), mail.as_string())
- except:
-- print('Error: Open smtp-session')
-- exit(4)
-- try:
-- mailsession.sendmail(config.sender, receiver.split(','), mail.as_string())
-- except:
-- print('Error: sendmail')
-- exit(5)
-+ print('Error: sendmail')
-+ syslog.syslog(syslog.LOG_ERR, 'error on sendmail')
-+ exit(5)
-
- # returns a list of all content that is stored locally for a specific site
- def getFileContents(shortname):
-@@ -332,7 +336,11 @@ if __name__ == "__main__":
-
- configMod = '/etc/mwc/mwc-config'
- dryrun = None
--
-+
-+ #
-+ # add syslog open
-+ #
-+ syslog.openlog()
- try:
- opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 'dry-run='])
- except getopt.GetoptError:
-@@ -357,11 +365,13 @@ if __name__ == "__main__":
- config = importlib.import_module(fullname)
- except:
- print('Error: loading config')
-+ syslog.syslog(syslog.LOG_ERR, 'can not found / load mwc-config')
- sys.exit(2)
- try:
- os.chdir(config.datadir)
- except:
- print('Error: datadir not found')
-+ syslog.syslog(syslog.LOG_ERR, 'datadir not found')
- sys.exit(3)
-
- if dryrun:
-@@ -383,3 +393,5 @@ if __name__ == "__main__":
- mailsession.quit()
- mailsession = None
-
-+ syslog.closelog()
-+
-\ No newline at end of file
diff --git a/debian/patches/0115-Add_header.patch b/debian/patches/0115-Add_header.patch
deleted file mode 100644
index 6ce0c15..0000000
--- a/debian/patches/0115-Add_header.patch
+++ /dev/null
@@ -1,50 +0,0 @@
-Description: Add Header Accept
-Author: Jörg Frings-Fürst <debian@jff-webhosting.net>
-Bug-Debian: http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=862004
-Forwarded: https://github.com/Debianguru/MailWebsiteChanges/issues/11
-Last-Update: 2017-05-07
----
-This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
-Index: trunk/mwc.py
-===================================================================
---- trunk.orig/mwc.py
-+++ trunk/mwc.py
-@@ -91,6 +91,8 @@ def parseSite(site):
- req = urllib.request.Request(uri)
- if 'user-agent' in site:
- req.add_header('User-Agent', site['user-agent'])
-+ if 'accept' in site:
-+ req.add_header('Accept', site['accept'])
- file = urllib.request.urlopen(req)
-
-
-Index: trunk/README.md
-===================================================================
---- trunk.orig/README.md
-+++ trunk/README.md
-@@ -59,7 +59,9 @@ sites = [
- * <b>user-agent</b> (optional)
- Defines the user agent string, e.g.,
- 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'
--
-+ * <b>accept</b> (optional)
-+ Defines the accept string, e.g.,
-+ 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
-
- * We collect some XPath/CSS snippets at this place: <a href="https://github.com/Debianguru/MailWebsiteChanges/wiki/snippets">Snippet collection</a> - please feel free to add your own definitions!
-
-Index: trunk/config_template.py
-===================================================================
---- trunk.orig/config_template.py
-+++ trunk/config_template.py
-@@ -12,7 +12,9 @@ sites = [
- 'contentxpath': '//div',
- 'titleregex': '',
- 'contentregex': '',
-- 'encoding': 'utf-8'},
-+ 'encoding': 'utf-8',
-+ 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0',
-+ 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'},
-
- {'shortname': 'mywebsite2',
- 'uri': 'http://www.mywebsite2.com/info',
diff --git a/debian/patches/series b/debian/patches/series
index a06ba62..e69de29 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,4 +0,0 @@
-0100-config.diff
-0105-try_mail.diff
-0110-syslog.diff
-#0115-Add_header.patch
diff --git a/mwc.py b/mwc.py
index a0635a1..6a48317 100755
--- a/mwc.py
+++ b/mwc.py
@@ -1,14 +1,12 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
-# Copyright: (2013-2015) Michael Till Beck <Debianguru@gmx.de>
+# Copyright: (2013-2017) Michael Till Beck <Debianguru@gmx.de>
# License: GPL-2.0+
-import urllib.request, urllib.error, urllib.parse
-import urllib.parse
-from lxml import etree
-from cssselect import GenericTranslator
-import re
import io
+from lxml import etree
+import hashlib
import smtplib
from email.mime.text import MIMEText
@@ -20,8 +18,6 @@ import sys
import getopt
import traceback
-import subprocess
-
import time
from time import strftime
import random
@@ -30,330 +26,215 @@ import importlib
config = None
defaultEncoding = 'utf-8'
-maxTitleLength = 150
# this is how an empty RSS feed looks like
emptyfeed = """<?xml version="1.0"?>
<rss version="2.0">
<channel>
<title>MailWebsiteChanges Feed</title>
- <link>https://github.com/Debianguru/MailWebsiteChanges</link>
+ <link>https://github.com/mtill/MailWebsiteChanges</link>
<description>MailWebsiteChanges Feed</description>
</channel>
</rss>"""
-# Attributes in HTML files storing URI values. These values are automatically translated to absolute URIs.
-uriAttributes = [['//img[@src]', 'src'], ['//a[@href]', 'href']]
-cmdscheme = 'cmd://'
-
mailsession = None
-# translates all relative URIs found in trees to absolute URIs
-def toAbsoluteURIs(trees, baseuri):
- for tree in trees:
- if isinstance(tree, str):
- continue
- for uriAttribute in uriAttributes:
- tags = tree.xpath(uriAttribute[0])
- for tag in tags:
- if tag.attrib.get(uriAttribute[1]) != None:
- if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '':
- tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]])
-
-
-def parseSite(site):
- file, content, titles, warning = None, None, None, None
-
- uri = site['uri']
- contenttype = site.get('type', 'html')
- contentregex = site.get('contentregex', '')
- titleregex = site.get('titleregex', '')
- enc = site.get('encoding', defaultEncoding)
-
- contentxpath = site.get('contentxpath', '')
- if contentxpath == '' and site.get('contentcss', '') != '':
- # CSS
- contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss'))
- titlexpath = site.get('titlexpath', '')
- if titlexpath == '' and site.get('titlecss', '') != '':
- titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss'))
-
- try:
-
- if uri.startswith(cmdscheme):
- # run command and retrieve output
- process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True)
- file = process.stdout
- else:
- # open website
- req = urllib.request.Request(uri)
- if 'user-agent' in site:
- req.add_header('User-Agent', site['user-agent'])
- if 'accept' in site:
- req.add_header('Accept', site['accept'])
- file = urllib.request.urlopen(req)
-
-
- if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
- contents = [file.read().decode(enc)]
- titles = []
- else:
- baseuri = uri
- if contenttype == 'html':
- parser = etree.HTMLParser(encoding=enc)
- else:
- parser = etree.XMLParser(recover=True, encoding=enc)
-
- tree = etree.parse(file, parser)
-
- # xpath
- contentresult = tree.xpath(contentxpath) if contentxpath else []
- titleresult = tree.xpath(titlexpath) if titlexpath else []
-
- # translate relative URIs to absolute URIs
- if contenttype == 'html':
- basetaglist = tree.xpath('/html/head/base')
- if len(basetaglist) != 0:
- baseuri = basetaglist[0].attrib['href']
- if len(contentresult) != 0:
- toAbsoluteURIs(contentresult, baseuri)
- if len(titleresult) != 0:
- toAbsoluteURIs(titleresult, baseuri)
-
- if contentxpath != '' and titlexpath != '' and len(contentresult) != len(titleresult):
- warning = 'WARNING: number of title blocks (' + str(len(titleresult)) + ') does not match number of content blocks (' + str(len(contentresult)) + ')'
- elif contentxpath and len(contentresult) == 0:
- warning = 'WARNING: content selector became invalid!'
- elif titlexpath and len(titleresult) == 0:
- warning = 'WARNING: title selector became invalid!'
- else:
- if len(contentresult) == 0:
- contentresult = titleresult
- if len(titleresult) == 0:
- titleresult = contentresult
-
- if isinstance(contentresult, str):
- contents = [contentresult]
- else:
- contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult]
- if isinstance(titleresult, str):
- titles = [getSubject(titleresult)]
- else:
- titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult]
-
- except IOError as e:
- warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e)
-
- if file is not None:
- file.close()
-
- if uri.startswith(cmdscheme) and process.wait() != 0:
- warning = 'WARNING: process terminated with an error'
-
- if warning:
- return {'content': content, 'titles': titles, 'warning': warning}
-
- # parse regex
- if contentregex:
- contents = [x for y in [re.findall(r'' + contentregex, c, re.S) for c in contents] for x in y]
- if titleregex:
- titles = [x for y in [re.findall(r'' + titleregex, c, re.S) for c in titles] for x in y]
-
- if contentregex and titleregex and len(contents) != len(titles):
- warning = 'WARNING: number of title blocks (' + str(len(titles)) + ') does not match number of content blocks (' + str(len(contents)) + ') after regex'
- elif contentregex and len(contents) == 0:
- warning = 'WARNING: content regex became invalid!'
- elif titleregex and len(titles) == 0:
- warning = 'WARNING: title regex became invalid!'
- else:
- if len(contents) == 0:
- contents = titles
- if len(titles) == 0:
- titles = [getSubject(c) for c in contents]
-
- return {'contents': contents, 'titles': titles, 'warning': warning}
-
-
-# returns a short subject line
-def getSubject(textContent):
- if textContent == None or textContent == '':
- return config.subjectPostfix
- textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip()
- return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent
-
-
# generates a new RSS feed item
def genFeedItem(subject, content, link, change):
- feeditem = etree.Element('item')
- titleitem = etree.Element('title')
- titleitem.text = subject + ' #' + str(change)
- feeditem.append(titleitem)
- linkitem = etree.Element('link')
- linkitem.text = link
- feeditem.append(linkitem)
- descriptionitem = etree.Element('description')
- descriptionitem.text = content
- feeditem.append(descriptionitem)
- guiditem = etree.Element('guid')
- guiditem.text = str(random.getrandbits(32))
- feeditem.append(guiditem)
- dateitem = etree.Element('pubDate')
- dateitem.text = strftime("%a, %d %b %Y %H:%M:%S %Z", time.localtime())
- feeditem.append(dateitem)
-
- return feeditem
+ feeditem = etree.Element('item')
+ titleitem = etree.Element('title')
+ titleitem.text = subject + ' #' + str(change)
+ feeditem.append(titleitem)
+ linkitem = etree.Element('link')
+ linkitem.text = link
+ feeditem.append(linkitem)
+ descriptionitem = etree.Element('description')
+ descriptionitem.text = content
+ feeditem.append(descriptionitem)
+ guiditem = etree.Element('guid')
+ guiditem.text = str(random.getrandbits(32))
+ feeditem.append(guiditem)
+ dateitem = etree.Element('pubDate')
+ dateitem.text = strftime("%a, %d %b %Y %H:%M:%S %Z", time.localtime())
+ feeditem.append(dateitem)
+
+ return feeditem
# sends mail notification
-def sendmail(receiver, subject, content, sendAsHtml, link):
- global mailsession
-
- if sendAsHtml:
- baseurl = None
- if link != None:
- content = '<p><a href="' + link + '">' + subject + '</a></p>\n' + content
- baseurl = urljoin(link, '/')
- mail = MIMEText('<html><head><title>' + subject + '</title>' + ('<base href="' + baseurl + '">' if baseurl else '') + '</head><body>' + content + '</body></html>', 'html', defaultEncoding)
- else:
- if link != None:
- content = link + '\n\n' + content
- mail = MIMEText(content, 'text', defaultEncoding)
+def sendmail(receiver, subject, content, sendAsHtml, link, encoding=None):
+ global mailsession, defaultEncoding
+
+ if encoding is None:
+ encoding = defaultEncoding
+
+ if sendAsHtml:
+ baseurl = None
+ if link is not None:
+ content = '<p><a href="' + link + '">' + subject + '</a></p>\n' + content
+ baseurl = urljoin(link, '/')
+ mail = MIMEText('<html><head><title>' + subject + '</title>' + ('<base href="' + baseurl + '">' if baseurl else '') + '</head><body>' + content + '</body></html>', 'html', encoding)
+ else:
+ if link is not None:
+ content = link + '\n\n' + content
+ mail = MIMEText(content, 'text', encoding)
+
+ mail['From'] = config.sender
+ mail['To'] = receiver
+ mail['Subject'] = Header(subject, encoding)
+
+ # initialize session once, not each time this method gets called
+ if mailsession is None:
+ mailsession = smtplib.SMTP(config.smtphost, config.smtpport)
+ if config.useTLS:
+ mailsession.ehlo()
+ mailsession.starttls()
+ if config.smtpusername is not None:
+ mailsession.login(config.smtpusername, config.smtppwd)
+
+ mailsession.sendmail(config.sender, receiver.split(','), mail.as_string())
- mail['From'] = config.sender
- mail['To'] = receiver
- mail['Subject'] = Header(subject, defaultEncoding)
- # initialize session once, not each time this method gets called
- if mailsession is None:
- mailsession = smtplib.SMTP(config.smtphost, config.smtpport)
- if config.useTLS:
- mailsession.ehlo()
- mailsession.starttls()
- if config.smtpusername is not None:
- mailsession.login(config.smtpusername, config.smtppwd)
+# returns a list of all content that is stored locally for a specific site
+def getStoredHashes(name):
+ result = []
+ filename = os.path.join(config.workingDirectory, name + ".txt")
+ if os.path.exists(filename):
+ with open(filename, 'r') as thefile:
+ for line in thefile:
+ result.append(line.rstrip())
- mailsession.sendmail(config.sender, receiver.split(','), mail.as_string())
+ return result
-# returns a list of all content that is stored locally for a specific site
-def getFileContents(shortname):
- result = []
- for f in os.listdir('.'):
- if f.startswith(shortname + '.') and f.endswith('.txt'):
- file = open(f, 'rb')
- result.append(file.read().decode('utf-8'))
- file.close()
- return result
+# updates list of content that is stored locally for a specific site
+def storeHashes(name, contentHashes):
+ with open(os.path.join(config.workingDirectory, name + '.txt'), 'w') as thefile:
+ for h in contentHashes:
+ thefile.write(h + "\n")
-# updates list of content that is stored locally for a specific site
-def storeFileContents(shortname, parseResult):
- for f in os.listdir('.'):
- if f.startswith(shortname + '.') and f.endswith('.txt'):
- os.remove(f)
+def runParsers(parsers, contentList=None):
+ if contentList is None:
+ contentList = []
- i = 0
- for c in parseResult['contents']:
- file = open(shortname + '.' + str(i) + '.txt', 'wb')
- file.write(c.encode('utf-8'))
- file.close()
- i += 1
+ for parser in parsers:
+ contentList = parser.performAction(contentList)
+ return contentList
-def pollWebsites():
- # parse existing feed or create a new one
- if config.enableRSSFeed:
- if os.path.isfile(config.rssfile):
- feedXML = etree.parse(config.rssfile)
- else:
- feedXML = etree.parse(io.StringIO(emptyfeed))
-
- # start polling sites
- for site in config.sites:
-
- print('polling site [' + site['shortname'] + '] ...')
- parseResult = parseSite(site)
- receiver = site.get('receiver', config.receiver)
-
- # if something went wrong, notify the user
- if parseResult['warning']:
- subject = '[' + site['shortname'] + '] WARNING'
- print('WARNING: ' + parseResult['warning'])
- if config.enableMailNotifications:
- sendmail(receiver, subject, parseResult['warning'], False, None)
- if config.enableRSSFeed:
- feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0))
- else:
- # otherwise, check which parts of the site were updated
- changes = 0
- fileContents = getFileContents(site['shortname'])
- i = 0
- for content in parseResult['contents']:
- if content not in fileContents:
- changes += 1
-
- subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i]
- print(' ' + subject)
- if config.enableMailNotifications and len(fileContents) > 0:
- sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri'])
-
- if config.enableRSSFeed:
- feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes))
- i += 1
-
-
- if changes > 0:
- storeFileContents(site['shortname'], parseResult)
- print(' ' + str(changes) + ' updates')
-
- # store feed
- if config.enableRSSFeed:
- for o in feedXML.xpath('//channel/item[position()<last()-' + str(config.maxFeeds - 1) + ']'):
- o.getparent().remove(o)
- file = open(config.rssfile, 'w')
- file.write(etree.tostring(feedXML, pretty_print=True, xml_declaration=True, encoding=defaultEncoding).decode(defaultEncoding))
- file.close()
+def pollWebsites():
+ global defaultEncoding
+ # parse existing feed or create a new one
+ rssfile = config.rssfile
+ if not os.path.isabs(rssfile):
+ rssfile = os.path.join(config.workingDirectory, rssfile)
-if __name__ == "__main__":
+ if config.enableRSSFeed:
+ if os.path.isfile(rssfile):
+ feedXML = etree.parse(rssfile)
+ else:
+ feedXML = etree.parse(io.StringIO(emptyfeed))
- configMod = 'config'
- dryrun = None
+ # start polling sites
+ mailsSent = 0
+ for site in config.sites:
+ print('polling site [' + site['name'] + '] ...')
+ receiver = site.get('receiver', config.receiver)
try:
- opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 'dry-run='])
- except getopt.GetoptError:
- print('Usage: mwc.py --config=config --dry-run=shortname')
- sys.exit(1)
- for opt, arg in opts:
- if opt == '-h':
- print('Usage: mwc.py --config=config')
- exit()
- elif opt in ('-c', '--config'):
- configMod = arg
- elif opt in ('-d', '--dry-run'):
- dryrun = arg
-
- config = importlib.import_module(configMod)
-
- if dryrun:
- for site in config.sites:
- if site['shortname'] == dryrun:
- parseResult = parseSite(site)
- print(parseResult)
- break
- else:
- try:
- pollWebsites()
- except:
- msg = str(sys.exc_info()[0]) + '\n\n' + traceback.format_exc()
- print(msg)
- if config.receiver != '':
- sendmail(config.receiver, '[mwc] Something went wrong ...', msg, False, None)
-
- if mailsession:
- mailsession.quit()
- mailsession = None
+ contentList = runParsers(site['parsers'])
+ except Exception as e:
+ # if something went wrong, notify the user
+ subject = '[' + site['name'] + '] WARNING'
+ print('WARNING: ' + str(e))
+ if config.enableMailNotifications:
+ if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession:
+ sendmail(receiver=receiver, subject=subject, content=str(e), sendAsHtml=False, link=None)
+ mailsSent = mailsSent + 1
+ if config.enableRSSFeed:
+ feedXML.xpath('//channel')[0].append(genFeedItem(subject, str(e), "", 0))
+ continue
+
+ sessionHashes = []
+ changedContents = []
+ fileHashes = getStoredHashes(site['name'])
+ for content in contentList:
+
+ contenthash = hashlib.md5(content.content.encode(content.encoding)).hexdigest()
+ if contenthash not in fileHashes:
+ if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession:
+ sessionHashes.append(contenthash)
+ changedContents.append(content)
+
+ subject = '[' + site['name'] + '] ' + content.title
+ print(' ' + subject)
+ if config.enableMailNotifications and len(fileHashes) > 0:
+ sendAsHtml = (content.contenttype == 'html')
+ sendmail(receiver=receiver, subject=subject, content=content.content, sendAsHtml=sendAsHtml, link=content.uri, encoding=content.encoding)
+ mailsSent = mailsSent + 1
+
+ if config.enableRSSFeed:
+ feedXML.xpath('//channel')[0].append(genFeedItem(subject, content.content, content.uri, len(changedContents)))
+ else:
+ sessionHashes.append(contenthash)
+
+ if 'postRun' in site:
+ runParsers(site['postRun'], changedContents)
+
+ if len(changedContents) > 0:
+ storeHashes(site['name'], sessionHashes)
+ print(' ' + str(len(changedContents)) + ' updates')
+
+ # store feed
+ if config.enableRSSFeed:
+ for o in feedXML.xpath('//channel/item[position()<last()-' + str(config.maxFeeds - 1) + ']'):
+ o.getparent().remove(o)
+ with open(rssfile, 'w') as thefile:
+ thefile.write(etree.tostring(feedXML, pretty_print=True, xml_declaration=True, encoding=defaultEncoding).decode(defaultEncoding, errors='ignore'))
+
+if __name__ == "__main__":
+ configMod = 'config'
+ dryrun = None
+
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 'dry-run='])
+ except getopt.GetoptError:
+ print('Usage: mwc.py --config=config --dry-run=name')
+ sys.exit(1)
+ for opt, arg in opts:
+ if opt == '-h':
+ print('Usage: mwc.py --config=config')
+ exit()
+ elif opt in ('-c', '--config'):
+ configMod = arg
+ elif opt in ('-d', '--dry-run'):
+ dryrun = arg
+
+ config = importlib.import_module(configMod)
+
+ if dryrun:
+ for thesite in config.sites:
+ if thesite['name'] == dryrun:
+ parseResult = runParsers(thesite['parsers'])
+ for p in parseResult:
+ print(p.title)
+ print(p.content)
+ print(str(len(parseResult)) + " results")
+ break
+ else:
+ try:
+ pollWebsites()
+ except:
+ msg = str(sys.exc_info()[0]) + '\n\n' + traceback.format_exc()
+ print(msg)
+ if config.receiver != '':
+ sendmail(receiver=config.receiver, subject='[mwc] Something went wrong ...', content=msg, sendAsHtml=False, link=None)
+
+ if mailsession:
+ mailsession.quit()
+ mailsession = None
diff --git a/mwcfeedserver.py b/mwcfeedserver.py
index 98093b9..0bca4b0 100755
--- a/mwcfeedserver.py
+++ b/mwcfeedserver.py
@@ -1,35 +1,38 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
-# Copyright: (2013-2014) Michael Till Beck <Debianguru@gmx.de>
+# Copyright: (2013-2017) Michael Till Beck <Debianguru@gmx.de>
# License: GPL-2.0+
+
import http.server
import socketserver
import importlib
import sys
import getopt
+
bind = 'localhost'
port = 8000
configMod = 'config'
try:
- opts, args = getopt.getopt(sys.argv[1:], 'hc:b:p:', ['help', 'config=', 'bind=', 'port='])
+ opts, args = getopt.getopt(sys.argv[1:], 'hc:b:p:', ['help', 'config=', 'bind=', 'port='])
except getopt.GetoptError:
- print('Usage: FeedServer.py --config=config --port=8000')
- sys.exit(1)
+ print('Usage: FeedServer.py --config=config --port=8000 --bind=localhost')
+ sys.exit(1)
for opt, arg in opts:
- if opt == '-h':
- print('Usage: FeedServer.py --config=config --bind=localhost --port=8000')
- exit()
- elif opt in ('-c', '--config'):
- configMod = arg
- elif opt in ('-b', '--bind'):
- bind = arg
- elif opt in ('-p', '--port'):
- port = int(arg)
+ if opt == '-h':
+ print('Usage: FeedServer.py --config=config --bind=localhost --port=8000')
+ exit()
+ elif opt in ('-c', '--config'):
+ configMod = arg
+ elif opt in ('-b', '--bind'):
+ bind = arg
+ elif opt in ('-p', '--port'):
+ port = int(arg)
config = importlib.import_module(configMod)
diff --git a/mwctools.py b/mwctools.py
new file mode 100755
index 0000000..cefbbf0
--- /dev/null
+++ b/mwctools.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright: (2013-2017) Michael Till Beck <Debianguru@gmx.de>
+# License: GPL-2.0+
+
+
+import urllib.request
+import urllib.error
+import urllib.parse
+import subprocess
+
+from lxml import etree
+from cssselect import GenericTranslator
+import re
+
+
+# Attributes in HTML files storing URI values. These values are automatically translated to absolute URIs.
+uriAttributes = [['//img[@src]', 'src'], ['//a[@href]', 'href']]
+
+maxTitleLength = 150
+
+
+class Parser:
+ # input: [Content], output: [Content]
+ def performAction(self, contentList):
+ pass
+
+
+class Receiver(Parser):
+ def __init__(self, uri):
+ self.uri = uri
+
+
+class Content:
+ def __init__(self, uri, encoding, title, content, contenttype):
+ self.uri = uri
+ self.encoding = encoding
+ self.title = title
+ self.content = content
+ self.contenttype = contenttype
+
+
+# returns a short subject line
+def getSubject(textContent):
+ global maxTitleLength
+
+ if textContent is None or len(textContent.strip()) == 0:
+ return 'Website has been updated'
+ textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip()
+ return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent
+
+
+# translates all relative URIs found in trees to absolute URIs
+def toAbsoluteURIs(trees, baseuri):
+ global uriAttributes
+
+ for tree in trees:
+ if isinstance(tree, str):
+ continue
+ for uriAttribute in uriAttributes:
+ tags = tree.xpath(uriAttribute[0])
+ for tag in tags:
+ if tag.attrib.get(uriAttribute[1]) is not None:
+ if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '':
+ tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]])
+
+
+class URLReceiver(Receiver):
+ def __init__(self, uri, contenttype='html', encoding='utf-8', userAgent=None, accept=None):
+ super().__init__(uri)
+ self.contenttype = contenttype
+ self.encoding = encoding
+ self.userAgent = userAgent
+ self.accept = accept
+
+ # input: [Content], output: [Content]
+ def performAction(self, contentList=None):
+ if contentList is None:
+ contentList = []
+
+ # open website
+ req = urllib.request.Request(self.uri)
+ if self.userAgent is not None:
+ req.add_header('User-Agent', self.userAgent)
+ if self.accept is not None:
+ req.add_header('Accept', self.accept)
+
+ with urllib.request.urlopen(req) as thefile:
+ filecontent = thefile.read().decode(self.encoding, errors='ignore')
+ contentList.append(Content(uri=self.uri, encoding=self.encoding, title=None, content=filecontent, contenttype=self.contenttype))
+
+ return contentList
+
+
+class CommandReceiver(Receiver):
+ def __init__(self, command, contenttype='text', encoding='utf-8'):
+ super().__init__(command)
+ self.encoding = encoding
+ self.command = command
+ self.contenttype = contenttype
+
+ # input: [Content], output: [Content]
+ def performAction(self, contentList=None):
+ if contentList is None:
+ contentList = []
+
+ # run command and retrieve output
+ process = subprocess.Popen(self.command, stdout=subprocess.PIPE, shell=True, close_fds=True)
+ thefile = process.stdout
+ result = thefile.read().decode(self.encoding, errors='ignore')
+ thefile.close()
+
+ if process.wait() != 0:
+ raise Exception("process terminated with an error")
+
+ contentList.append(Content(uri=None, encoding=self.encoding, title=None, content=result, contenttype=self.contenttype))
+ return contentList
+
+
+class XPathParser(Parser):
+ def __init__(self, contentxpath, titlexpath=None):
+ self.contentxpath = contentxpath
+ self.titlexpath = titlexpath
+
+ # input: [Content], output: [Content]
+ def performAction(self, contentList):
+ result = []
+ for content in contentList:
+ result.extend(self.parseOneObject(content))
+ return result
+
+ # input: Content, output: [Content]
+ def parseOneObject(self, content):
+ baseuri = content.uri
+ if content.contenttype == 'html':
+ parser = etree.HTMLParser(encoding=content.encoding)
+ else:
+ parser = etree.XMLParser(recover=True, encoding=content.encoding)
+
+ tree = etree.fromstring(content.content, parser=parser)
+
+ # xpath
+ contentresult = [] if self.contentxpath is None else tree.xpath(self.contentxpath)
+ titleresult = [] if self.titlexpath is None else tree.xpath(self.titlexpath)
+
+ # translate relative URIs to absolute URIs
+ if content.contenttype == 'html':
+ basetaglist = tree.xpath('/html/head/base')
+ if len(basetaglist) != 0:
+ baseuri = basetaglist[0].attrib['href']
+ if len(contentresult) != 0:
+ toAbsoluteURIs(contentresult, baseuri)
+ if len(titleresult) != 0:
+ toAbsoluteURIs(titleresult, baseuri)
+
+ if self.contentxpath and len(contentresult) == 0:
+ raise Exception('WARNING: content selector became invalid!')
+ if self.titlexpath and len(titleresult) == 0:
+ raise Exception('WARNING: title selector became invalid!')
+
+ contents = []
+ titles = []
+ if isinstance(contentresult, str):
+ contents = [contentresult]
+ else:
+ if len(contentresult) == 0:
+ contentresult = titleresult
+ contents = [etree.tostring(s, encoding=content.encoding, pretty_print=True).decode(content.encoding, errors='ignore') for s in contentresult]
+
+ if isinstance(titleresult, str):
+ titles = [getSubject(titleresult)]*len(contents)
+ else:
+ if len(titleresult) == 0 or len(titleresult) != len(contentresult):
+ titleresult = contentresult
+ titles = [getSubject(etree.tostring(s, method='text', encoding=content.encoding).decode(content.encoding, errors='ignore')) for s in titleresult]
+
+ result = []
+ for i in range(0, len(contents)):
+ result.append(Content(uri=content.uri, encoding=content.encoding, title=titles[i], content=contents[i], contenttype=content.contenttype))
+
+ return result
+
+
+class CSSParser(Parser):
+ def __init__(self, contentcss, titlecss=None):
+ contentxpath = GenericTranslator().css_to_xpath(contentcss)
+ titlexpath = None
+ if titlecss is not None:
+ titlexpath = GenericTranslator().css_to_xpath(titlecss)
+
+ self.xpathparser = XPathParser(contentxpath=contentxpath, titlexpath=titlexpath)
+
+ # input: [Content], output: [Content]
+ def performAction(self, contentList):
+ return self.xpathparser.performAction(contentList)
+
+
+class RegExParser(Parser):
+ def __init__(self, contentregex, titleregex=None):
+ self.contentregex = contentregex
+ self.titleregex = titleregex
+
+ # input: [Content], output: [Content]
+ def performAction(self, contentList):
+ result = []
+ for content in contentList:
+ result.extend(self.parseOneObject(content))
+ return result
+
+ # input: Content, output: [Content]
+ def parseOneObject(self, content):
+ contents = []
+ titles = []
+ if self.contentregex is not None:
+ for c in re.findall(r'' + self.contentregex, content.content, re.M):
+ if len(c.strip()) != 0:
+ contents.append(c)
+ if self.titleregex is not None:
+ for c in re.findall(r'' + self.titleregex, content.title, re.M):
+ if len(c.strip()) != 0:
+ titles.append(c)
+
+ if self.contentregex is not None and len(contents) == 0:
+ raise Exception('WARNING: content regex became invalid!')
+ elif self.titleregex is not None and len(titles) == 0:
+ raise Exception('WARNING: title regex became invalid!')
+ else:
+ if len(contents) == 0:
+ contents = titles
+ if len(titles) == 0 or len(titles) != len(contents):
+ titles = [getSubject(c) for c in contents]
+
+ result = []
+ for i in range(0, len(contents)):
+ result.append(Content(uri=content.uri, encoding=content.encoding, title=titles[i], content=contents[i], contenttype=content.contenttype))
+
+ return result
+