New upstream version 2.0.4 (tag: upstream/2.0.4)

author: Jörg Frings-Fürst <debian@jff-webhosting.net> 2017-10-01 18:50:17 +0200
committer: Jörg Frings-Fürst <debian@jff-webhosting.net> 2017-10-01 18:50:17 +0200
commit: 04b13e003d6af0de21e6c59e411ffee5b97b6134 (patch)
tree: 6ea896c823290df0c598f82f8daa61713de373f5 /mwctools.py
parent: f8f939634396158de53fb26fa7f9a539a92fb219 (diff)
1 files changed, 239 insertions, 0 deletions
diff --git a/mwctools.py b/mwctools.py
new file mode 100755
index 0000000..cefbbf0
--- /dev/null
+++ b/mwctools.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright: (2013-2017) Michael Till Beck <Debianguru@gmx.de>
+# License: GPL-2.0+
+
+
+import urllib.request
+import urllib.error
+import urllib.parse
+import subprocess
+
+from lxml import etree
+from cssselect import GenericTranslator
+import re
+
+
+# [XPath selector, attribute name] pairs for HTML attributes holding URIs; toAbsoluteURIs() makes their values absolute.
+uriAttributes = [['//img[@src]', 'src'], ['//a[@href]', 'href']]
+# Maximum number of characters getSubject() keeps of a subject line before appending ' [..]'.
+maxTitleLength = 150
+
+
+class Parser:
+    # Pipeline stage interface: subclasses take a list of Content and return a list of Content.
+    def performAction(self, contentList):
+        return None  # the base class does nothing
+
+
+class Receiver(Parser):  # Parser subclass that only records the source it was constructed with
+    def __init__(self, uri):
+        self.uri = uri  # URLReceiver passes a URL here, CommandReceiver its shell command
+
+
+class Content:
+    # Plain record for one piece of received or parsed data handed from one pipeline stage to the next.
+    def __init__(self, uri, encoding, title, content, contenttype):
+        # where the data came from and which encoding its text uses
+        self.uri, self.encoding = uri, encoding
+        # subject line, payload and payload kind ('html', 'text', ...)
+        self.title, self.content, self.contenttype = title, content, contenttype
+
+
+# Returns a short, single-line subject for textContent: whitespace runs are collapsed to one space and
+# anything beyond maxTitleLength characters is replaced by ' [..]'; None/blank input gives a default subject.
+def getSubject(textContent):
+    if textContent is None or not textContent.strip():
+        return 'Website has been updated'
+    # raw string: '\s' in a plain literal is an invalid escape sequence (Python warns about it)
+    textContent = re.sub(r'\s+', ' ', textContent).strip()
+    return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent
+
+
+# Rewrites, in place, every scheme-less URI attribute listed in uriAttributes to an absolute URI based on baseuri.
+# String entries in trees are skipped; all other entries must offer xpath() (e.g. lxml elements).
+def toAbsoluteURIs(trees, baseuri):
+    for node in trees:
+        if isinstance(node, str):
+            continue
+        for selector, attrName in uriAttributes:
+            for element in node.xpath(selector):
+                value = element.attrib.get(attrName)
+                # only values without a scheme are treated as relative
+                if value is None or urllib.parse.urlparse(value).scheme != '':
+                    continue
+                element.attrib[attrName] = urllib.parse.urljoin(baseuri, value)
+
+
+class URLReceiver(Receiver):
+    # Receiver that downloads uri; userAgent and accept, when given, are sent as request headers.
+    def __init__(self, uri, contenttype='html', encoding='utf-8', userAgent=None, accept=None):
+        super().__init__(uri)
+        self.contenttype, self.encoding = contenttype, encoding
+        self.userAgent, self.accept = userAgent, accept
+
+    # input: [Content] or None, output: the same list (a new one for None) with the fetched document appended
+    def performAction(self, contentList=None):
+        collected = [] if contentList is None else contentList
+
+        # only send the optional headers that were actually configured
+        headers = {}
+        if self.userAgent is not None:
+            headers['User-Agent'] = self.userAgent
+        if self.accept is not None:
+            headers['Accept'] = self.accept
+        request = urllib.request.Request(self.uri, headers=headers)
+
+        # undecodable bytes are dropped rather than raising
+        with urllib.request.urlopen(request) as response:
+            text = response.read().decode(self.encoding, errors='ignore')
+            collected.append(Content(uri=self.uri, encoding=self.encoding, title=None, content=text, contenttype=self.contenttype))
+
+        return collected
+
+
+class CommandReceiver(Receiver):
+    # Receiver that runs a shell command and turns its standard output into one Content entry.
+    # NOTE: command is passed to the shell unmodified (shell=True), so it must come from a trusted source.
+    def __init__(self, command, contenttype='text', encoding='utf-8'):
+        super().__init__(command)
+        self.encoding = encoding
+        self.command = command
+        self.contenttype = contenttype
+
+    # input: [Content] or None, output: the same list (a new one for None) with the command output appended
+    # raises: Exception if the command exits with a non-zero status
+    def performAction(self, contentList=None):
+        if contentList is None:
+            contentList = []
+
+        # run() always waits for the process and closes its pipe, even if reading fails, unlike a bare Popen
+        completed = subprocess.run(self.command, stdout=subprocess.PIPE, shell=True, close_fds=True)
+        result = completed.stdout.decode(self.encoding, errors='ignore')
+        if completed.returncode != 0:
+            raise Exception("process terminated with an error")
+
+        contentList.append(Content(uri=None, encoding=self.encoding, title=None, content=result, contenttype=self.contenttype))
+        return contentList
+
+
+class XPathParser(Parser):
+    # Parser that selects content (and optionally titles) from HTML/XML documents via XPath expressions.
+    def __init__(self, contentxpath, titlexpath=None):
+        self.contentxpath = contentxpath
+        self.titlexpath = titlexpath
+
+    # input: [Content], output: [Content] (the concatenated results of parseOneObject for every entry)
+    def performAction(self, contentList):
+        result = []
+        for content in contentList:
+            result.extend(self.parseOneObject(content))
+        return result
+
+    # input: Content, output: [Content] with one entry per selected content node (title node if no content selector)
+    # raises: Exception if a configured selector matches nothing
+    def parseOneObject(self, content):
+        encoding = content.encoding
+        if content.contenttype == 'html':
+            parser = etree.HTMLParser(encoding=encoding)
+        else:
+            parser = etree.XMLParser(recover=True, encoding=encoding)
+        tree = etree.fromstring(content.content, parser=parser)
+
+        contentresult = [] if self.contentxpath is None else tree.xpath(self.contentxpath)
+        titleresult = [] if self.titlexpath is None else tree.xpath(self.titlexpath)
+        if self.contentxpath and len(contentresult) == 0:
+            raise Exception('WARNING: content selector became invalid!')
+        if self.titlexpath and len(titleresult) == 0:
+            raise Exception('WARNING: title selector became invalid!')
+
+        # translate relative URIs to absolute URIs
+        if content.contenttype == 'html':
+            baseuri = content.uri
+            # use the first base element that has an href (one without href raised KeyError); href may be relative
+            for basetag in tree.xpath('/html/head/base[@href]')[:1]:
+                baseuri = urllib.parse.urljoin(content.uri, basetag.attrib['href'])
+            toAbsoluteURIs(contentresult, baseuri)
+            toAbsoluteURIs(titleresult, baseuri)
+
+        # without a content selector the title nodes double as content; a scalar string result
+        # (e.g. from string(...)) is handled like a one-element node list
+        if len(contentresult) == 0:
+            contentresult = titleresult
+        if isinstance(contentresult, str):
+            contentresult = [contentresult]
+
+        # string results (scalar, text() or attribute selections) are already text; elements get serialized
+        def serialize(node, **options):
+            if isinstance(node, str):
+                return node
+            return etree.tostring(node, encoding=encoding, **options).decode(encoding, errors='ignore')
+
+        contents = [serialize(node, pretty_print=True) for node in contentresult]
+        if isinstance(titleresult, str):
+            titles = [getSubject(titleresult)] * len(contents)
+        else:
+            if len(titleresult) != len(contentresult):
+                titleresult = contentresult
+            titles = [getSubject(serialize(node, method='text')) for node in titleresult]
+
+        return [Content(uri=content.uri, encoding=encoding, title=title, content=body, contenttype=content.contenttype)
+                for title, body in zip(titles, contents)]
+
+
+class CSSParser(Parser):
+    # Parser driven by CSS selectors: they are translated to XPath up front and an XPathParser does the work.
+    def __init__(self, contentcss, titlecss=None):
+        # titlecss is optional; None is passed through so that no title selector is applied
+        self.xpathparser = XPathParser(
+            contentxpath=GenericTranslator().css_to_xpath(contentcss),
+            titlexpath=None if titlecss is None else GenericTranslator().css_to_xpath(titlecss),
+        )
+
+    # input: [Content], output: [Content] as returned by the wrapped XPathParser
+    def performAction(self, contentList):
+        return self.xpathparser.performAction(contentList)
+
+
+class RegExParser(Parser):
+    # Parser that extracts content from Content.content and titles from Content.title with regular expressions.
+    def __init__(self, contentregex, titleregex=None):
+        self.contentregex = contentregex
+        self.titleregex = titleregex
+
+    # input: [Content], output: [Content] (the concatenated results of parseOneObject for every entry)
+    def performAction(self, contentList):
+        result = []
+        for content in contentList:
+            result.extend(self.parseOneObject(content))
+        return result
+
+    # returns the non-blank re.findall() matches of regex in text (multi-line mode); no regex or no text: []
+    @staticmethod
+    def _matches(regex, text):
+        if regex is None or text is None:
+            return []
+        return [match for match in re.findall(regex, text, re.M) if len(match.strip()) != 0]
+
+    # input: Content, output: [Content], one per match; raises Exception if a configured regex matches nothing
+    def parseOneObject(self, content):
+        contents = self._matches(self.contentregex, content.content)
+        # the receivers set title to None; that counts as "no title match" instead of a TypeError in re.findall()
+        titles = self._matches(self.titleregex, content.title)
+
+        if self.contentregex is not None and len(contents) == 0:
+            raise Exception('WARNING: content regex became invalid!')
+        if self.titleregex is not None and len(titles) == 0:
+            raise Exception('WARNING: title regex became invalid!')
+
+        # without a content regex the title matches double as content; titles that do not line up
+        # one-to-one with the contents are replaced by subjects derived from the contents
+        if len(contents) == 0:
+            contents = titles
+        if len(titles) != len(contents):
+            titles = [getSubject(c) for c in contents]
+
+        return [Content(uri=content.uri, encoding=content.encoding, title=t, content=c, contenttype=content.contenttype)
+                for t, c in zip(titles, contents)]
+
author	Jörg Frings-Fürst <debian@jff-webhosting.net>	2017-10-01 18:50:17 +0200
committer	Jörg Frings-Fürst <debian@jff-webhosting.net>	2017-10-01 18:50:17 +0200
commit	04b13e003d6af0de21e6c59e411ffee5b97b6134 (patch)
tree	6ea896c823290df0c598f82f8daa61713de373f5 /mwctools.py
parent	f8f939634396158de53fb26fa7f9a539a92fb219 (diff)