MailWebsiteChanges Feed

From b329a335ad10648e64b3de049c083bc04638a072 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Fri, 19 Sep 2014 08:48:17 +0200 Subject: Imported Upstream version 1.7.2 --- mwc.py | 345 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100755 mwc.py (limited to 'mwc.py') diff --git a/mwc.py b/mwc.py new file mode 100755 index 0000000..5d7278e --- /dev/null +++ b/mwc.py @@ -0,0 +1,345 @@ +#!/usr/bin/python3 + +# Copyright: (2013-2014) Michael Till Beck +# License: GPL-2.0+ + +import urllib.request, urllib.error, urllib.parse +import urllib.parse +from lxml import etree +from cssselect import GenericTranslator +import re +import io + +import smtplib +from email.mime.text import MIMEText +from email.header import Header +from urllib.parse import urljoin + +import os +import sys +import getopt +import traceback + +import subprocess + +import time +from time import strftime +import random + +import importlib +config = None + +defaultEncoding = 'utf-8' +maxTitleLength = 150 + +# this is how an empty feed looks like +emptyfeed = """ + + + MailWebsiteChanges Feed + https://github.com/Debianguru/MailWebsiteChanges + MailWebsiteChanges Feed + +""" + +# Attributes in HTML files storing URI values. These values are automatically translated to absolute URIs. +uriAttributes = [['//img[@src]', 'src'], ['//a[@href]', 'href']] +cmdscheme = 'cmd://' + +mailsession = None + + +# translates all relative URIs found in trees to absolute URIs +def toAbsoluteURIs(trees, baseuri): + for tree in trees: + for uriAttribute in uriAttributes: + tags = tree.xpath(uriAttribute[0]) + for tag in tags: + if tag.attrib.get(uriAttribute[1]) != None: + if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '': + tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]]) + + +def parseSite(site): + file, content, titles, warning = None, None, None, None + + uri = site['uri'] + contenttype = site.get('type', 'html') + contentregex = site.get('contentregex', '') + titleregex = site.get('titleregex', '') + enc = site.get('encoding', defaultEncoding) + + contentxpath = site.get('contentxpath', '') + if contentxpath == '' and site.get('contentcss', '') != '': + # CSS + contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss')) + titlexpath = site.get('titlexpath', '') + if titlexpath == '' and site.get('titlecss', '') != '': + titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss')) + + try: + + if uri.startswith(cmdscheme): + # run command and retrieve output + process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True) + file = process.stdout + else: + # open website + file = urllib.request.urlopen(uri) + + + if contenttype == 'text' or (contentxpath == '' and titlexpath == ''): + contents = [file.read().decode(enc)] + titles = [] + else: + baseuri = uri + if contenttype == 'html': + parser = etree.HTMLParser(encoding=enc) + else: + parser = etree.XMLParser(recover=True, encoding=enc) + + tree = etree.parse(file, parser) + + # xpath + contentresult = tree.xpath(contentxpath) if contentxpath else [] + titleresult = tree.xpath(titlexpath) if titlexpath else [] + + # translate relative URIs to absolute URIs + if contenttype == 'html': + basetaglist = tree.xpath('/html/head/base') + if len(basetaglist) != 0: + baseuri = basetaglist[0].attrib['href'] + if len(contentresult) != 0: + toAbsoluteURIs(contentresult, baseuri) + if len(titleresult) != 0: + toAbsoluteURIs(titleresult, baseuri) + + if contentxpath != '' and titlexpath != '' and len(contentresult) != len(titleresult): + warning = 'WARNING: number of title blocks (' + str(len(titleresult)) + ') does not match number of content blocks (' + str(len(contentresult)) + ')' + elif contentxpath and len(contentresult) == 0: + warning = 'WARNING: content selector became invalid!' + elif titlexpath and len(titleresult) == 0: + warning = 'WARNING: title selector became invalid!' + else: + if len(contentresult) == 0: + contentresult = titleresult + if len(titleresult) == 0: + titleresult = contentresult + + contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult] + titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult] + + except IOError as e: + warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e) + + if file is not None: + file.close() + + if uri.startswith(cmdscheme) and process.wait() != 0: + warning = 'WARNING: process terminated with an error' + + if warning: + return {'content': content, 'titles': titles, 'warning': warning} + + # parse regex + if contentregex: + contents = [x for y in [re.findall(r'' + contentregex, c, re.S) for c in contents] for x in y] + if titleregex: + titles = [x for y in [re.findall(r'' + titleregex, c, re.S) for c in titles] for x in y] + + if contentregex and titleregex and len(contents) != len(titles): + warning = 'WARNING: number of title blocks (' + str(len(titles)) + ') does not match number of content blocks (' + str(len(contents)) + ') after regex' + elif contentregex and len(contents) == 0: + warning = 'WARNING: content regex became invalid!' + elif titleregex and len(titles) == 0: + warning = 'WARNING: title regex became invalid!' + else: + if len(contents) == 0: + contents = titles + if len(titles) == 0: + titles = [getSubject(c) for c in contents] + + return {'contents': contents, 'titles': titles, 'warning': warning} + + +# returns a short subject line +def getSubject(textContent): + if textContent == None or textContent == '': + return config.subjectPostfix + textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip() + return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent + + +# generates a new RSS feed item +def genFeedItem(subject, content, link, change): + feeditem = etree.Element('item') + titleitem = etree.Element('title') + titleitem.text = subject + ' #' + str(change) + feeditem.append(titleitem) + linkitem = etree.Element('link') + linkitem.text = link + feeditem.append(linkitem) + descriptionitem = etree.Element('description') + descriptionitem.text = content + feeditem.append(descriptionitem) + guiditem = etree.Element('guid') + guiditem.text = str(random.getrandbits(32)) + feeditem.append(guiditem) + dateitem = etree.Element('pubDate') + dateitem.text = strftime("%a, %d %b %Y %H:%M:%S %Z", time.localtime()) + feeditem.append(dateitem) + + return feeditem + + +# sends mail notification +def sendmail(receiver, subject, content, sendAsHtml, link): + global mailsession + + if sendAsHtml: + baseurl = None + if link != None: + content = '

' + subject + '

\n' + content + baseurl = urljoin(link, '/') + mail = MIMEText('' + subject + '' + ('' if baseurl else '') + '' + content + '', 'html', defaultEncoding) + else: + if link != None: + content = link + '\n\n' + content + mail = MIMEText(content, 'text', defaultEncoding) + + mail['From'] = config.sender + mail['To'] = receiver + mail['Subject'] = Header(subject, defaultEncoding) + + # initialize session once, not each time this method gets called + if mailsession is None: + mailsession = smtplib.SMTP(config.smtphost, config.smtpport) + if config.useTLS: + mailsession.ehlo() + mailsession.starttls() + mailsession.login(config.smtpusername, config.smtppwd) + + mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) + + +# returns a list of all content that is stored locally for a specific site +def getFileContents(shortname): + result = [] + for f in os.listdir('.'): + if f.startswith(shortname + '.') and f.endswith('.txt'): + file = open(f, 'r') + result.append(file.read()) + file.close() + return result + + +# updates list of content that is stored locally for a specific site +def storeFileContents(shortname, parseResult): + for f in os.listdir('.'): + if f.startswith(shortname + '.') and f.endswith('.txt'): + os.remove(f) + + i = 0 + for c in parseResult['contents']: + file = open(shortname + '.' + str(i) + '.txt', 'w') + file.write(c) + file.close() + i += 1 + + +def pollWebsites(): + + # parse existing feed or create a new one + if config.enableRSSFeed: + if os.path.isfile(config.rssfile): + feedXML = etree.parse(config.rssfile) + else: + feedXML = etree.parse(io.StringIO(emptyfeed)) + + # start polling sites + for site in config.sites: + + print('polling site [' + site['shortname'] + '] ...') + parseResult = parseSite(site) + receiver = site.get('receiver', config.receiver) + + # if something went wrong, notify the user + if parseResult['warning']: + subject = '[' + site['shortname'] + '] WARNING' + print('WARNING: ' + parseResult['warning']) + if config.enableMailNotifications: + sendmail(receiver, subject, parseResult['warning'], False, None) + if config.enableRSSFeed: + feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0)) + else: + # otherwise, check which parts of the site were updated + changes = 0 + fileContents = getFileContents(site['shortname']) + i = 0 + for content in parseResult['contents']: + if content not in fileContents: + changes += 1 + + subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i] + print(' ' + subject) + if config.enableMailNotifications and len(fileContents) > 0: + sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri']) + + if config.enableRSSFeed: + feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes)) + i += 1 + + + if changes > 0: + storeFileContents(site['shortname'], parseResult) + print(' ' + str(changes) + ' updates') + + # store feed + if config.enableRSSFeed: + for o in feedXML.xpath('//channel/item[position()