summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJörg Frings-Fürst <debian@jff-webhosting.net>2017-10-01 18:50:17 +0200
committerJörg Frings-Fürst <debian@jff-webhosting.net>2017-10-01 18:50:17 +0200
commit04b13e003d6af0de21e6c59e411ffee5b97b6134 (patch)
tree6ea896c823290df0c598f82f8daa61713de373f5
parentf8f939634396158de53fb26fa7f9a539a92fb219 (diff)
New upstream version 2.0.4upstream/2.0.4
-rw-r--r--README.md89
-rwxr-xr-x[-rw-r--r--]config_template.py74
-rwxr-xr-xmwc.py298
-rwxr-xr-xmwcfeedserver.py31
-rwxr-xr-xmwctools.py239
5 files changed, 433 insertions, 298 deletions
diff --git a/README.md b/README.md
index 8e78da6..69718e7 100644
--- a/README.md
+++ b/README.md
@@ -19,20 +19,23 @@ Some examples:
<code>
sites = [
- {'shortname': 'mywebsite1',
- 'uri': 'http://www.mywebsite1.com/info',
- 'contentcss': 'div'},
-
- {'shortname': 'mywebsite2',
- 'uri': 'http://www.mywebsite2.com/info',
- 'contentxpath': '//*[contains(concat(\' \', normalize-space(@class), \' \'), \' news-list-container \')]',
- 'titlexpath': '//title'},
-
- {'shortname': 'mywebsite3',
- 'uri': 'http://www.mywebsite3.com/info',
- 'type': 'text',
- 'contentregex': 'Version\"\:\d*\.\d*',
- 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'}
+ {'name': 'example-css',
+ 'parsers': [uri(uri='https://github.com/mtill', contenttype='html'),
+ css(contentcss='div')
+ ]
+ },
+
+ {'name': 'example-xpath',
+ 'parsers': [uri(uri='https://example-webpage.com/test', contenttype='html'),
+ xpath(contentxpath='//div[contains(concat(\' \', normalize-space(@class), \' \'), \' package-version-header \')]')
+ ]
+ },
+
+ {'name': 'my-script',
+ 'parsers': [command(command='/home/user/script.sh', contenttype='text'),
+ regex(contentregex='^.*$')
+ ]
+ }
]
</code>
@@ -40,31 +43,55 @@ sites = [
* parameters:
- * <b>shortname</b>
- short name of the entry, used as an identifier when sending email notifications
+ * <b>name</b>
+ name of the entry, used as an identifier when sending email notifications
+ * <b>receiver</b> (optional)
+ Overrides global receiver specification.
+
+ * parameters for the URL receiver:
+
* <b>uri</b>
- URI of the website; If the scheme of the uri is 'cmd://', the string is interpreted as a command and the standard output (stdout) is parsed.
- * <b>type</b> (optional; default: 'html')
+ URI of the website
+ * <b>contenttype</b> (optional; default: 'html')
content type, e.g., 'xml'/'html'/'text'.
- * <b>contentxpath</b> / <b>titlexpath</b> (optional)
- XPath expression for the content/title sections to extract. If you prefer, you could use contentcss/titlecss instead.
- * <b>contentcss</b> / <b>titlecss</b> (optional)
- CSS expression for the content/title sections to extract. This is ignored if there is a corresponding XPath definition.
- * <b>contentregex</b> / <b>titleregex</b> (optional)
- Regular expression. If XPath/CSS selector is defined, the regular expression is applied afterwards.
- * <b>encoding</b> (optional; default: 'utf-8')
+ * <b>enc</b> (optional; default: 'utf-8')
Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'.
- * <b>splitregex</b> (optional)
- only works if type is set to 'text'; defines that content should be split to chunks based on the defined regex expression.
- * <b>receiver</b> (optional)
- Overrides global receiver specification.
- * <b>user-agent</b> (optional)
+ * <b>userAgent</b> (optional)
Defines the user agent string, e.g.,
- 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'
+ 'userAgent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'
* <b>accept</b> (optional)
Defines the accept string, e.g.,
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+ * parameters for the Command receiver:
+
+ * <b>command</b>
+ the command
+ * <b>contenttype</b> (optional; default: 'text')
+ content type, e.g., 'xml'/'html'/'text'.
+   * <b>enc</b> (optional; default: 'utf-8')
+     Character encoding of the command output, e.g., 'utf-8' or 'iso-8859-1'.
+
+ * parameters for the XPath parser:
+
+ * <b>contentxpath</b>
+ XPath expression for the content sections to extract
+ * <b>titlexpath</b> (optional)
+ XPath expression for the title sections to extract
+
+ * parameters for the CSS parser:
+
+ * <b>contentcss</b>
+ CSS expression for the content sections to extract
+ * <b>titlecss</b> (optional)
+ CSS expression for the title sections to extract
+
+ * parameters for the RegEx parser:
+
+ * <b>contentregex</b>
+ Regular expression for content parsing
+ * <b>titleregex</b> (optional)
+ Regular expression for title parsing
* We collect some XPath/CSS snippets at this place: <a href="https://github.com/Debianguru/MailWebsiteChanges/wiki/snippets">Snippet collection</a> - please feel free to add your own definitions!
diff --git a/config_template.py b/config_template.py
index f394e52..02788bd 100644..100755
--- a/config_template.py
+++ b/config_template.py
@@ -1,47 +1,49 @@
-import os.path
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
-# Copyright: (2013-2014) Michael Till Beck <Debianguru@gmx.de>
+# Copyright: (2013-2017) Michael Till Beck <Debianguru@gmx.de>
# License: GPL-2.0+
-#We collect xpath snippets at this place: <a href="https://github.com/Debianguru/MailWebsiteChanges/wiki/snippets">Snippet collection</a> - please feel free to add your own definitions!
+
+# We collect xpath snippets at this place:
+# <a href="https://github.com/Debianguru/MailWebsiteChanges/wiki/snippets">Snippet collection</a>
+# Feel free to contribute!
+
+
+from mwctools import URLReceiver as uri
+from mwctools import CommandReceiver as command
+from mwctools import XPathParser as xpath
+from mwctools import CSSParser as css
+from mwctools import RegExParser as regex
+from mwctools import Content
+from mwctools import Parser
+
sites = [
- {'shortname': 'mywebsite1',
- 'uri': 'http://www.mywebsite1.com/info',
- 'type': 'html',
- 'titlexpath': '//h1',
- 'contentxpath': '//div',
- 'titleregex': '',
- 'contentregex': '',
- 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0',
- 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
- 'encoding': 'utf-8'},
-
- {'shortname': 'mywebsite2',
- 'uri': 'http://www.mywebsite2.com/info',
- 'type': 'html',
- 'contentxpath': '//*[contains(concat(\' \', normalize-space(@class), \' \'), \' news-list-container \')]',
- 'regex': '',
- 'encoding': 'utf-8'},
-
- {'shortname': 'mywebsite3',
- 'uri': 'http://www.mywebsite3.com/info',
- 'type': 'text',
- 'contentxpath': '',
- 'contentregex': 'Version\"\:\d*\.\d*',
- 'encoding': 'utf-8'},
-
- {'shortname': 'lscmd',
- 'uri': 'cmd://ls -l /home/pi',
- 'contentregex': '.*Desktop.*'
- }
+ {'name': 'example-css',
+ 'parsers': [uri(uri='https://github.com/mtill', contenttype='html'),
+ css(contentcss='div')
+ ]
+ },
+
+ {'name': 'example-xpath',
+ 'parsers': [uri(uri='https://example-webpage.com/test', contenttype='html'),
+ xpath(contentxpath='//div[contains(concat(\' \', normalize-space(@class), \' \'), \' package-version-header \')]')
+ ]
+ },
+
+ {'name': 'my-script',
+ 'parsers': [command(command='/home/user/script.sh', contenttype='text'),
+ regex(contentregex='^.*$')
+ ]
+ }
]
-subjectPostfix = 'A website has been updated!'
+workingDirectory = '/path-to-data-dir/MailWebsiteChanges-data'
-enableMailNotifications = True
+enableMailNotifications = False
maxMailsPerSession = -1
sender = 'me@mymail.com'
smtphost = 'mysmtpprovider.com'
@@ -51,9 +53,7 @@ smtpusername = sender
smtppwd = 'mypassword'
receiver = 'me2@mymail.com'
-os.chdir('/var/cache/mwc')
-
-enableRSSFeed = True
+enableRSSFeed = False
rssfile = 'feed.xml'
maxFeeds = 100
diff --git a/mwc.py b/mwc.py
index 4df4799..6a48317 100755
--- a/mwc.py
+++ b/mwc.py
@@ -1,15 +1,11 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright: (2013-2015) Michael Till Beck <Debianguru@gmx.de>
+# Copyright: (2013-2017) Michael Till Beck <Debianguru@gmx.de>
# License: GPL-2.0+
-import urllib.request, urllib.error, urllib.parse
-import urllib.parse
-from lxml import etree
-from cssselect import GenericTranslator
-import re
import io
+from lxml import etree
import hashlib
import smtplib
@@ -22,8 +18,6 @@ import sys
import getopt
import traceback
-import subprocess
-
import time
from time import strftime
import random
@@ -32,164 +26,20 @@ import importlib
config = None
defaultEncoding = 'utf-8'
-maxTitleLength = 150
# this is how an empty RSS feed looks like
emptyfeed = """<?xml version="1.0"?>
<rss version="2.0">
<channel>
<title>MailWebsiteChanges Feed</title>
- <link>https://github.com/Debianguru/MailWebsiteChanges</link>
+ <link>https://github.com/mtill/MailWebsiteChanges</link>
<description>MailWebsiteChanges Feed</description>
</channel>
</rss>"""
-# Attributes in HTML files storing URI values. These values are automatically translated to absolute URIs.
-uriAttributes = [['//img[@src]', 'src'], ['//a[@href]', 'href']]
-cmdscheme = 'cmd://'
-
mailsession = None
-# translates all relative URIs found in trees to absolute URIs
-def toAbsoluteURIs(trees, baseuri):
- for tree in trees:
- if isinstance(tree, str):
- continue
- for uriAttribute in uriAttributes:
- tags = tree.xpath(uriAttribute[0])
- for tag in tags:
- if tag.attrib.get(uriAttribute[1]) != None:
- if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '':
- tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]])
-
-
-def parseSite(site):
- global defaultEncoding
- file, content, titles, warning = None, None, None, None
-
- uri = site['uri']
- contenttype = site.get('type', 'html')
- contentregex = site.get('contentregex', '')
- titleregex = site.get('titleregex', '')
- splitregex = site.get('splitregex', '')
- enc = site.get('encoding', defaultEncoding)
-
- contentxpath = site.get('contentxpath', '')
- if contentxpath == '' and site.get('contentcss', '') != '':
- # CSS
- contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss'))
- titlexpath = site.get('titlexpath', '')
- if titlexpath == '' and site.get('titlecss', '') != '':
- titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss'))
-
- try:
-
- if uri.startswith(cmdscheme):
- # run command and retrieve output
- process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True)
- file = process.stdout
- else:
- # open website
- req = urllib.request.Request(uri)
- if 'user-agent' in site:
- req.add_header('User-Agent', site['user-agent'])
- if 'accept' in site:
- req.add_header('Accept', site['accept'])
- file = urllib.request.urlopen(req)
-
-
- if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
- thefullcontent = file.read().decode(enc, errors='ignore')
- contents = [thefullcontent]
- if splitregex != '':
- contents = thefullcontent.split(splitregex)
- titles = []
- else:
- baseuri = uri
- if contenttype == 'html':
- parser = etree.HTMLParser(encoding=enc)
- else:
- parser = etree.XMLParser(recover=True, encoding=enc)
-
- tree = etree.parse(file, parser)
-
- # xpath
- contentresult = tree.xpath(contentxpath) if contentxpath else []
- titleresult = tree.xpath(titlexpath) if titlexpath else []
-
- # translate relative URIs to absolute URIs
- if contenttype == 'html':
- basetaglist = tree.xpath('/html/head/base')
- if len(basetaglist) != 0:
- baseuri = basetaglist[0].attrib['href']
- if len(contentresult) != 0:
- toAbsoluteURIs(contentresult, baseuri)
- if len(titleresult) != 0:
- toAbsoluteURIs(titleresult, baseuri)
-
- if contentxpath != '' and titlexpath != '' and len(contentresult) != len(titleresult):
- warning = 'WARNING: number of title blocks (' + str(len(titleresult)) + ') does not match number of content blocks (' + str(len(contentresult)) + ')'
- elif contentxpath and len(contentresult) == 0:
- warning = 'WARNING: content selector became invalid!'
- elif titlexpath and len(titleresult) == 0:
- warning = 'WARNING: title selector became invalid!'
- else:
- if len(contentresult) == 0:
- contentresult = titleresult
- if len(titleresult) == 0:
- titleresult = contentresult
-
- if isinstance(contentresult, str):
- contents = [contentresult]
- else:
- contents = [etree.tostring(s, encoding=enc, pretty_print=True).decode(enc, errors='ignore') for s in contentresult]
- if isinstance(titleresult, str):
- titles = [getSubject(titleresult)]
- else:
- titles = [getSubject(etree.tostring(s, method='text', encoding=enc).decode(enc, errors='ignore')) for s in titleresult]
-
- except IOError as e:
- warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e)
-
- if file is not None:
- file.close()
-
- if uri.startswith(cmdscheme) and process.wait() != 0:
- warning = 'WARNING: process terminated with an error'
-
- if warning:
- return {'content': content, 'titles': titles, 'warning': warning}
-
- # parse regex
- if contentregex:
- contents = [x for y in [re.findall(r'' + contentregex, c, re.S) for c in contents] for x in y]
- if titleregex:
- titles = [x for y in [re.findall(r'' + titleregex, c, re.S) for c in titles] for x in y]
-
- if contentregex and titleregex and len(contents) != len(titles):
- warning = 'WARNING: number of title blocks (' + str(len(titles)) + ') does not match number of content blocks (' + str(len(contents)) + ') after regex'
- elif contentregex and len(contents) == 0:
- warning = 'WARNING: content regex became invalid!'
- elif titleregex and len(titles) == 0:
- warning = 'WARNING: title regex became invalid!'
- else:
- if len(contents) == 0:
- contents = titles
- if len(titles) == 0:
- titles = [getSubject(c) for c in contents]
-
- return {'contents': contents, 'titles': titles, 'warning': warning}
-
-
-# returns a short subject line
-def getSubject(textContent):
- if textContent == None or textContent == '':
- return config.subjectPostfix
- textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip()
- return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent
-
-
# generates a new RSS feed item
def genFeedItem(subject, content, link, change):
feeditem = etree.Element('item')
@@ -213,23 +63,26 @@ def genFeedItem(subject, content, link, change):
# sends mail notification
-def sendmail(receiver, subject, content, sendAsHtml, link):
+def sendmail(receiver, subject, content, sendAsHtml, link, encoding=None):
global mailsession, defaultEncoding
+ if encoding is None:
+ encoding = defaultEncoding
+
if sendAsHtml:
baseurl = None
- if link != None:
+ if link is not None:
content = '<p><a href="' + link + '">' + subject + '</a></p>\n' + content
baseurl = urljoin(link, '/')
- mail = MIMEText('<html><head><title>' + subject + '</title>' + ('<base href="' + baseurl + '">' if baseurl else '') + '</head><body>' + content + '</body></html>', 'html', defaultEncoding)
+ mail = MIMEText('<html><head><title>' + subject + '</title>' + ('<base href="' + baseurl + '">' if baseurl else '') + '</head><body>' + content + '</body></html>', 'html', encoding)
else:
- if link != None:
+ if link is not None:
content = link + '\n\n' + content
- mail = MIMEText(content, 'text', defaultEncoding)
+ mail = MIMEText(content, 'text', encoding)
mail['From'] = config.sender
mail['To'] = receiver
- mail['Subject'] = Header(subject, defaultEncoding)
+ mail['Subject'] = Header(subject, encoding)
# initialize session once, not each time this method gets called
if mailsession is None:
@@ -244,103 +97,114 @@ def sendmail(receiver, subject, content, sendAsHtml, link):
# returns a list of all content that is stored locally for a specific site
-def getStoredHashes(shortname):
-
+def getStoredHashes(name):
result = []
- filename = shortname + ".txt"
+ filename = os.path.join(config.workingDirectory, name + ".txt")
if os.path.exists(filename):
- with open(filename, 'r') as file:
- for line in file:
+ with open(filename, 'r') as thefile:
+ for line in thefile:
result.append(line.rstrip())
return result
# updates list of content that is stored locally for a specific site
-def storeHashes(shortname, contentHashes):
-
- with open(shortname + '.txt', 'w') as file:
+def storeHashes(name, contentHashes):
+ with open(os.path.join(config.workingDirectory, name + '.txt'), 'w') as thefile:
for h in contentHashes:
- file.write(h + "\n")
+ thefile.write(h + "\n")
+
+
+def runParsers(parsers, contentList=None):
+ if contentList is None:
+ contentList = []
+
+ for parser in parsers:
+ contentList = parser.performAction(contentList)
+
+ return contentList
def pollWebsites():
global defaultEncoding
# parse existing feed or create a new one
+ rssfile = config.rssfile
+ if not os.path.isabs(rssfile):
+ rssfile = os.path.join(config.workingDirectory, rssfile)
+
if config.enableRSSFeed:
- if os.path.isfile(config.rssfile):
- feedXML = etree.parse(config.rssfile)
+ if os.path.isfile(rssfile):
+ feedXML = etree.parse(rssfile)
else:
feedXML = etree.parse(io.StringIO(emptyfeed))
# start polling sites
mailsSent = 0
for site in config.sites:
- print('polling site [' + site['shortname'] + '] ...')
- sessionHashes = []
- parseResult = parseSite(site)
+ print('polling site [' + site['name'] + '] ...')
receiver = site.get('receiver', config.receiver)
- # if something went wrong, notify the user
- if parseResult['warning']:
- subject = '[' + site['shortname'] + '] WARNING'
- print('WARNING: ' + parseResult['warning'])
+ try:
+ contentList = runParsers(site['parsers'])
+ except Exception as e:
+ # if something went wrong, notify the user
+ subject = '[' + site['name'] + '] WARNING'
+ print('WARNING: ' + str(e))
if config.enableMailNotifications:
if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession:
- sendmail(receiver, subject, parseResult['warning'], False, None)
+ sendmail(receiver=receiver, subject=subject, content=str(e), sendAsHtml=False, link=None)
mailsSent = mailsSent + 1
if config.enableRSSFeed:
- feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0))
- else:
- # otherwise, check which parts of the site were updated
- changes = 0
- fileHashes = getStoredHashes(site['shortname'])
- i = 0
- for content in parseResult['contents']:
-
- contenthash = hashlib.md5(content.encode(defaultEncoding)).hexdigest()
- if contenthash not in fileHashes:
- if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession:
- changes += 1
- sessionHashes.append(contenthash)
-
- subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i]
- print(' ' + subject)
- if config.enableMailNotifications and len(fileHashes) > 0:
- sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri'])
- mailsSent = mailsSent + 1
-
- if config.enableRSSFeed:
- feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes))
- else:
+ feedXML.xpath('//channel')[0].append(genFeedItem(subject, str(e), "", 0))
+ continue
+
+ sessionHashes = []
+ changedContents = []
+ fileHashes = getStoredHashes(site['name'])
+ for content in contentList:
+
+ contenthash = hashlib.md5(content.content.encode(content.encoding)).hexdigest()
+ if contenthash not in fileHashes:
+ if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession:
sessionHashes.append(contenthash)
+ changedContents.append(content)
- i += 1
+ subject = '[' + site['name'] + '] ' + content.title
+ print(' ' + subject)
+ if config.enableMailNotifications and len(fileHashes) > 0:
+ sendAsHtml = (content.contenttype == 'html')
+ sendmail(receiver=receiver, subject=subject, content=content.content, sendAsHtml=sendAsHtml, link=content.uri, encoding=content.encoding)
+ mailsSent = mailsSent + 1
+ if config.enableRSSFeed:
+ feedXML.xpath('//channel')[0].append(genFeedItem(subject, content.content, content.uri, len(changedContents)))
+ else:
+ sessionHashes.append(contenthash)
+
+ if 'postRun' in site:
+ runParsers(site['postRun'], changedContents)
- if changes > 0:
- storeHashes(site['shortname'], sessionHashes)
- print(' ' + str(changes) + ' updates')
+ if len(changedContents) > 0:
+ storeHashes(site['name'], sessionHashes)
+ print(' ' + str(len(changedContents)) + ' updates')
# store feed
if config.enableRSSFeed:
for o in feedXML.xpath('//channel/item[position()<last()-' + str(config.maxFeeds - 1) + ']'):
o.getparent().remove(o)
- file = open(config.rssfile, 'w')
- file.write(etree.tostring(feedXML, pretty_print=True, xml_declaration=True, encoding=defaultEncoding).decode(defaultEncoding, errors='ignore'))
- file.close()
+ with open(rssfile, 'w') as thefile:
+ thefile.write(etree.tostring(feedXML, pretty_print=True, xml_declaration=True, encoding=defaultEncoding).decode(defaultEncoding, errors='ignore'))
if __name__ == "__main__":
-
configMod = 'config'
dryrun = None
try:
opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 'dry-run='])
except getopt.GetoptError:
- print('Usage: mwc.py --config=config --dry-run=shortname')
+ print('Usage: mwc.py --config=config --dry-run=name')
sys.exit(1)
for opt, arg in opts:
if opt == '-h':
@@ -354,11 +218,13 @@ if __name__ == "__main__":
config = importlib.import_module(configMod)
if dryrun:
- for site in config.sites:
- if site['shortname'] == dryrun:
- parseResult = parseSite(site)
- print(parseResult)
- print(str(len(parseResult['contents'])) + " results")
+ for thesite in config.sites:
+ if thesite['name'] == dryrun:
+ parseResult = runParsers(thesite['parsers'])
+ for p in parseResult:
+ print(p.title)
+ print(p.content)
+ print(str(len(parseResult)) + " results")
break
else:
try:
@@ -367,7 +233,7 @@ if __name__ == "__main__":
msg = str(sys.exc_info()[0]) + '\n\n' + traceback.format_exc()
print(msg)
if config.receiver != '':
- sendmail(config.receiver, '[mwc] Something went wrong ...', msg, False, None)
+ sendmail(receiver=config.receiver, subject='[mwc] Something went wrong ...', content=msg, sendAsHtml=False, link=None)
if mailsession:
mailsession.quit()
diff --git a/mwcfeedserver.py b/mwcfeedserver.py
index 98093b9..0bca4b0 100755
--- a/mwcfeedserver.py
+++ b/mwcfeedserver.py
@@ -1,35 +1,38 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
-# Copyright: (2013-2014) Michael Till Beck <Debianguru@gmx.de>
+# Copyright: (2013-2017) Michael Till Beck <Debianguru@gmx.de>
# License: GPL-2.0+
+
import http.server
import socketserver
import importlib
import sys
import getopt
+
bind = 'localhost'
port = 8000
configMod = 'config'
try:
- opts, args = getopt.getopt(sys.argv[1:], 'hc:b:p:', ['help', 'config=', 'bind=', 'port='])
+ opts, args = getopt.getopt(sys.argv[1:], 'hc:b:p:', ['help', 'config=', 'bind=', 'port='])
except getopt.GetoptError:
- print('Usage: FeedServer.py --config=config --port=8000')
- sys.exit(1)
+ print('Usage: FeedServer.py --config=config --port=8000 --bind=localhost')
+ sys.exit(1)
for opt, arg in opts:
- if opt == '-h':
- print('Usage: FeedServer.py --config=config --bind=localhost --port=8000')
- exit()
- elif opt in ('-c', '--config'):
- configMod = arg
- elif opt in ('-b', '--bind'):
- bind = arg
- elif opt in ('-p', '--port'):
- port = int(arg)
+ if opt == '-h':
+ print('Usage: FeedServer.py --config=config --bind=localhost --port=8000')
+ exit()
+ elif opt in ('-c', '--config'):
+ configMod = arg
+ elif opt in ('-b', '--bind'):
+ bind = arg
+ elif opt in ('-p', '--port'):
+ port = int(arg)
config = importlib.import_module(configMod)
diff --git a/mwctools.py b/mwctools.py
new file mode 100755
index 0000000..cefbbf0
--- /dev/null
+++ b/mwctools.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright: (2013-2017) Michael Till Beck <Debianguru@gmx.de>
+# License: GPL-2.0+
+
+
+import urllib.request
+import urllib.error
+import urllib.parse
+import subprocess
+
+from lxml import etree
+from cssselect import GenericTranslator
+import re
+
+
+# Attributes in HTML files storing URI values. These values are automatically translated to absolute URIs.
+uriAttributes = [['//img[@src]', 'src'], ['//a[@href]', 'href']]
+
+maxTitleLength = 150
+
+
+class Parser:
+ # input: [Content], output: [Content]
+ def performAction(self, contentList):
+ pass
+
+
+class Receiver(Parser):
+ def __init__(self, uri):
+ self.uri = uri
+
+
+class Content:
+ def __init__(self, uri, encoding, title, content, contenttype):
+ self.uri = uri
+ self.encoding = encoding
+ self.title = title
+ self.content = content
+ self.contenttype = contenttype
+
+
+# returns a short subject line
+def getSubject(textContent):
+ global maxTitleLength
+
+ if textContent is None or len(textContent.strip()) == 0:
+ return 'Website has been updated'
+ textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip()
+ return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent
+
+
+# translates all relative URIs found in trees to absolute URIs
+def toAbsoluteURIs(trees, baseuri):
+ global uriAttributes
+
+ for tree in trees:
+ if isinstance(tree, str):
+ continue
+ for uriAttribute in uriAttributes:
+ tags = tree.xpath(uriAttribute[0])
+ for tag in tags:
+ if tag.attrib.get(uriAttribute[1]) is not None:
+ if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '':
+ tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]])
+
+
+class URLReceiver(Receiver):
+ def __init__(self, uri, contenttype='html', encoding='utf-8', userAgent=None, accept=None):
+ super().__init__(uri)
+ self.contenttype = contenttype
+ self.encoding = encoding
+ self.userAgent = userAgent
+ self.accept = accept
+
+ # input: [Content], output: [Content]
+ def performAction(self, contentList=None):
+ if contentList is None:
+ contentList = []
+
+ # open website
+ req = urllib.request.Request(self.uri)
+ if self.userAgent is not None:
+ req.add_header('User-Agent', self.userAgent)
+ if self.accept is not None:
+ req.add_header('Accept', self.accept)
+
+ with urllib.request.urlopen(req) as thefile:
+ filecontent = thefile.read().decode(self.encoding, errors='ignore')
+ contentList.append(Content(uri=self.uri, encoding=self.encoding, title=None, content=filecontent, contenttype=self.contenttype))
+
+ return contentList
+
+
+class CommandReceiver(Receiver):
+ def __init__(self, command, contenttype='text', encoding='utf-8'):
+ super().__init__(command)
+ self.encoding = encoding
+ self.command = command
+ self.contenttype = contenttype
+
+ # input: [Content], output: [Content]
+ def performAction(self, contentList=None):
+ if contentList is None:
+ contentList = []
+
+ # run command and retrieve output
+ process = subprocess.Popen(self.command, stdout=subprocess.PIPE, shell=True, close_fds=True)
+ thefile = process.stdout
+ result = thefile.read().decode(self.encoding, errors='ignore')
+ thefile.close()
+
+ if process.wait() != 0:
+ raise Exception("process terminated with an error")
+
+ contentList.append(Content(uri=None, encoding=self.encoding, title=None, content=result, contenttype=self.contenttype))
+ return contentList
+
+
+class XPathParser(Parser):
+ def __init__(self, contentxpath, titlexpath=None):
+ self.contentxpath = contentxpath
+ self.titlexpath = titlexpath
+
+ # input: [Content], output: [Content]
+ def performAction(self, contentList):
+ result = []
+ for content in contentList:
+ result.extend(self.parseOneObject(content))
+ return result
+
+ # input: Content, output: [Content]
+ def parseOneObject(self, content):
+ baseuri = content.uri
+ if content.contenttype == 'html':
+ parser = etree.HTMLParser(encoding=content.encoding)
+ else:
+ parser = etree.XMLParser(recover=True, encoding=content.encoding)
+
+ tree = etree.fromstring(content.content, parser=parser)
+
+ # xpath
+ contentresult = [] if self.contentxpath is None else tree.xpath(self.contentxpath)
+ titleresult = [] if self.titlexpath is None else tree.xpath(self.titlexpath)
+
+ # translate relative URIs to absolute URIs
+ if content.contenttype == 'html':
+ basetaglist = tree.xpath('/html/head/base')
+ if len(basetaglist) != 0:
+ baseuri = basetaglist[0].attrib['href']
+ if len(contentresult) != 0:
+ toAbsoluteURIs(contentresult, baseuri)
+ if len(titleresult) != 0:
+ toAbsoluteURIs(titleresult, baseuri)
+
+ if self.contentxpath and len(contentresult) == 0:
+ raise Exception('WARNING: content selector became invalid!')
+ if self.titlexpath and len(titleresult) == 0:
+ raise Exception('WARNING: title selector became invalid!')
+
+ contents = []
+ titles = []
+ if isinstance(contentresult, str):
+ contents = [contentresult]
+ else:
+ if len(contentresult) == 0:
+ contentresult = titleresult
+ contents = [etree.tostring(s, encoding=content.encoding, pretty_print=True).decode(content.encoding, errors='ignore') for s in contentresult]
+
+ if isinstance(titleresult, str):
+ titles = [getSubject(titleresult)]*len(contents)
+ else:
+ if len(titleresult) == 0 or len(titleresult) != len(contentresult):
+ titleresult = contentresult
+ titles = [getSubject(etree.tostring(s, method='text', encoding=content.encoding).decode(content.encoding, errors='ignore')) for s in titleresult]
+
+ result = []
+ for i in range(0, len(contents)):
+ result.append(Content(uri=content.uri, encoding=content.encoding, title=titles[i], content=contents[i], contenttype=content.contenttype))
+
+ return result
+
+
+class CSSParser(Parser):
+ def __init__(self, contentcss, titlecss=None):
+ contentxpath = GenericTranslator().css_to_xpath(contentcss)
+ titlexpath = None
+ if titlecss is not None:
+ titlexpath = GenericTranslator().css_to_xpath(titlecss)
+
+ self.xpathparser = XPathParser(contentxpath=contentxpath, titlexpath=titlexpath)
+
+ # input: [Content], output: [Content]
+ def performAction(self, contentList):
+ return self.xpathparser.performAction(contentList)
+
+
+class RegExParser(Parser):
+ def __init__(self, contentregex, titleregex=None):
+ self.contentregex = contentregex
+ self.titleregex = titleregex
+
+ # input: [Content], output: [Content]
+ def performAction(self, contentList):
+ result = []
+ for content in contentList:
+ result.extend(self.parseOneObject(content))
+ return result
+
+ # input: Content, output: [Content]
+ def parseOneObject(self, content):
+ contents = []
+ titles = []
+ if self.contentregex is not None:
+ for c in re.findall(r'' + self.contentregex, content.content, re.M):
+ if len(c.strip()) != 0:
+ contents.append(c)
+ if self.titleregex is not None:
+ for c in re.findall(r'' + self.titleregex, content.title, re.M):
+ if len(c.strip()) != 0:
+ titles.append(c)
+
+ if self.contentregex is not None and len(contents) == 0:
+ raise Exception('WARNING: content regex became invalid!')
+ elif self.titleregex is not None and len(titles) == 0:
+ raise Exception('WARNING: title regex became invalid!')
+ else:
+ if len(contents) == 0:
+ contents = titles
+ if len(titles) == 0 or len(titles) != len(contents):
+ titles = [getSubject(c) for c in contents]
+
+ result = []
+ for i in range(0, len(contents)):
+ result.append(Content(uri=content.uri, encoding=content.encoding, title=titles[i], content=contents[i], contenttype=content.contenttype))
+
+ return result
+