MailWebsiteChanges Feed

From 04b13e003d6af0de21e6c59e411ffee5b97b6134 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 1 Oct 2017 18:50:17 +0200 Subject: New upstream version 2.0.4 --- mwc.py | 298 ++++++++++++++++++----------------------------------------------- 1 file changed, 82 insertions(+), 216 deletions(-) (limited to 'mwc.py') diff --git a/mwc.py b/mwc.py index 4df4799..6a48317 100755 --- a/mwc.py +++ b/mwc.py @@ -1,15 +1,11 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright: (2013-2015) Michael Till Beck +# Copyright: (2013-2017) Michael Till Beck # License: GPL-2.0+ -import urllib.request, urllib.error, urllib.parse -import urllib.parse -from lxml import etree -from cssselect import GenericTranslator -import re import io +from lxml import etree import hashlib import smtplib @@ -22,8 +18,6 @@ import sys import getopt import traceback -import subprocess - import time from time import strftime import random @@ -32,164 +26,20 @@ import importlib config = None defaultEncoding = 'utf-8' -maxTitleLength = 150 # this is how an empty RSS feed looks like emptyfeed = """ MailWebsiteChanges Feed - https://github.com/Debianguru/MailWebsiteChanges + https://github.com/mtill/MailWebsiteChanges MailWebsiteChanges Feed """ -# Attributes in HTML files storing URI values. These values are automatically translated to absolute URIs. -uriAttributes = [['//img[@src]', 'src'], ['//a[@href]', 'href']] -cmdscheme = 'cmd://' - mailsession = None -# translates all relative URIs found in trees to absolute URIs -def toAbsoluteURIs(trees, baseuri): - for tree in trees: - if isinstance(tree, str): - continue - for uriAttribute in uriAttributes: - tags = tree.xpath(uriAttribute[0]) - for tag in tags: - if tag.attrib.get(uriAttribute[1]) != None: - if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '': - tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]]) - - -def parseSite(site): - global defaultEncoding - file, content, titles, warning = None, None, None, None - - uri = site['uri'] - contenttype = site.get('type', 'html') - contentregex = site.get('contentregex', '') - titleregex = site.get('titleregex', '') - splitregex = site.get('splitregex', '') - enc = site.get('encoding', defaultEncoding) - - contentxpath = site.get('contentxpath', '') - if contentxpath == '' and site.get('contentcss', '') != '': - # CSS - contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss')) - titlexpath = site.get('titlexpath', '') - if titlexpath == '' and site.get('titlecss', '') != '': - titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss')) - - try: - - if uri.startswith(cmdscheme): - # run command and retrieve output - process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True) - file = process.stdout - else: - # open website - req = urllib.request.Request(uri) - if 'user-agent' in site: - req.add_header('User-Agent', site['user-agent']) - if 'accept' in site: - req.add_header('Accept', site['accept']) - file = urllib.request.urlopen(req) - - - if contenttype == 'text' or (contentxpath == '' and titlexpath == ''): - thefullcontent = file.read().decode(enc, errors='ignore') - contents = [thefullcontent] - if splitregex != '': - contents = thefullcontent.split(splitregex) - titles = [] - else: - baseuri = uri - if contenttype == 'html': - parser = etree.HTMLParser(encoding=enc) - else: - parser = etree.XMLParser(recover=True, encoding=enc) - - tree = etree.parse(file, parser) - - # xpath - contentresult = tree.xpath(contentxpath) if contentxpath else [] - titleresult = tree.xpath(titlexpath) if titlexpath else [] - - # translate relative URIs to absolute URIs - if contenttype == 'html': - basetaglist = tree.xpath('/html/head/base') - if len(basetaglist) != 0: - baseuri = basetaglist[0].attrib['href'] - if len(contentresult) != 0: - toAbsoluteURIs(contentresult, baseuri) - if len(titleresult) != 0: - toAbsoluteURIs(titleresult, baseuri) - - if contentxpath != '' and titlexpath != '' and len(contentresult) != len(titleresult): - warning = 'WARNING: number of title blocks (' + str(len(titleresult)) + ') does not match number of content blocks (' + str(len(contentresult)) + ')' - elif contentxpath and len(contentresult) == 0: - warning = 'WARNING: content selector became invalid!' - elif titlexpath and len(titleresult) == 0: - warning = 'WARNING: title selector became invalid!' - else: - if len(contentresult) == 0: - contentresult = titleresult - if len(titleresult) == 0: - titleresult = contentresult - - if isinstance(contentresult, str): - contents = [contentresult] - else: - contents = [etree.tostring(s, encoding=enc, pretty_print=True).decode(enc, errors='ignore') for s in contentresult] - if isinstance(titleresult, str): - titles = [getSubject(titleresult)] - else: - titles = [getSubject(etree.tostring(s, method='text', encoding=enc).decode(enc, errors='ignore')) for s in titleresult] - - except IOError as e: - warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e) - - if file is not None: - file.close() - - if uri.startswith(cmdscheme) and process.wait() != 0: - warning = 'WARNING: process terminated with an error' - - if warning: - return {'content': content, 'titles': titles, 'warning': warning} - - # parse regex - if contentregex: - contents = [x for y in [re.findall(r'' + contentregex, c, re.S) for c in contents] for x in y] - if titleregex: - titles = [x for y in [re.findall(r'' + titleregex, c, re.S) for c in titles] for x in y] - - if contentregex and titleregex and len(contents) != len(titles): - warning = 'WARNING: number of title blocks (' + str(len(titles)) + ') does not match number of content blocks (' + str(len(contents)) + ') after regex' - elif contentregex and len(contents) == 0: - warning = 'WARNING: content regex became invalid!' - elif titleregex and len(titles) == 0: - warning = 'WARNING: title regex became invalid!' - else: - if len(contents) == 0: - contents = titles - if len(titles) == 0: - titles = [getSubject(c) for c in contents] - - return {'contents': contents, 'titles': titles, 'warning': warning} - - -# returns a short subject line -def getSubject(textContent): - if textContent == None or textContent == '': - return config.subjectPostfix - textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip() - return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent - - # generates a new RSS feed item def genFeedItem(subject, content, link, change): feeditem = etree.Element('item') @@ -213,23 +63,26 @@ def genFeedItem(subject, content, link, change): # sends mail notification -def sendmail(receiver, subject, content, sendAsHtml, link): +def sendmail(receiver, subject, content, sendAsHtml, link, encoding=None): global mailsession, defaultEncoding + if encoding is None: + encoding = defaultEncoding + if sendAsHtml: baseurl = None - if link != None: + if link is not None: content = '

' + subject + '

\n' + content baseurl = urljoin(link, '/') - mail = MIMEText('' + subject + '' + ('' if baseurl else '') + '' + content + '', 'html', defaultEncoding) + mail = MIMEText('' + subject + '' + ('' if baseurl else '') + '' + content + '', 'html', encoding) else: - if link != None: + if link is not None: content = link + '\n\n' + content - mail = MIMEText(content, 'text', defaultEncoding) + mail = MIMEText(content, 'text', encoding) mail['From'] = config.sender mail['To'] = receiver - mail['Subject'] = Header(subject, defaultEncoding) + mail['Subject'] = Header(subject, encoding) # initialize session once, not each time this method gets called if mailsession is None: @@ -244,103 +97,114 @@ def sendmail(receiver, subject, content, sendAsHtml, link): # returns a list of all content that is stored locally for a specific site -def getStoredHashes(shortname): - +def getStoredHashes(name): result = [] - filename = shortname + ".txt" + filename = os.path.join(config.workingDirectory, name + ".txt") if os.path.exists(filename): - with open(filename, 'r') as file: - for line in file: + with open(filename, 'r') as thefile: + for line in thefile: result.append(line.rstrip()) return result # updates list of content that is stored locally for a specific site -def storeHashes(shortname, contentHashes): - - with open(shortname + '.txt', 'w') as file: +def storeHashes(name, contentHashes): + with open(os.path.join(config.workingDirectory, name + '.txt'), 'w') as thefile: for h in contentHashes: - file.write(h + "\n") + thefile.write(h + "\n") + + +def runParsers(parsers, contentList=None): + if contentList is None: + contentList = [] + + for parser in parsers: + contentList = parser.performAction(contentList) + + return contentList def pollWebsites(): global defaultEncoding # parse existing feed or create a new one + rssfile = config.rssfile + if not os.path.isabs(rssfile): + rssfile = os.path.join(config.workingDirectory, rssfile) + if config.enableRSSFeed: - if os.path.isfile(config.rssfile): - feedXML = etree.parse(config.rssfile) + if os.path.isfile(rssfile): + feedXML = etree.parse(rssfile) else: feedXML = etree.parse(io.StringIO(emptyfeed)) # start polling sites mailsSent = 0 for site in config.sites: - print('polling site [' + site['shortname'] + '] ...') - sessionHashes = [] - parseResult = parseSite(site) + print('polling site [' + site['name'] + '] ...') receiver = site.get('receiver', config.receiver) - # if something went wrong, notify the user - if parseResult['warning']: - subject = '[' + site['shortname'] + '] WARNING' - print('WARNING: ' + parseResult['warning']) + try: + contentList = runParsers(site['parsers']) + except Exception as e: + # if something went wrong, notify the user + subject = '[' + site['name'] + '] WARNING' + print('WARNING: ' + str(e)) if config.enableMailNotifications: if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: - sendmail(receiver, subject, parseResult['warning'], False, None) + sendmail(receiver=receiver, subject=subject, content=str(e), sendAsHtml=False, link=None) mailsSent = mailsSent + 1 if config.enableRSSFeed: - feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0)) - else: - # otherwise, check which parts of the site were updated - changes = 0 - fileHashes = getStoredHashes(site['shortname']) - i = 0 - for content in parseResult['contents']: - - contenthash = hashlib.md5(content.encode(defaultEncoding)).hexdigest() - if contenthash not in fileHashes: - if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: - changes += 1 - sessionHashes.append(contenthash) - - subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i] - print(' ' + subject) - if config.enableMailNotifications and len(fileHashes) > 0: - sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri']) - mailsSent = mailsSent + 1 - - if config.enableRSSFeed: - feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes)) - else: + feedXML.xpath('//channel')[0].append(genFeedItem(subject, str(e), "", 0)) + continue + + sessionHashes = [] + changedContents = [] + fileHashes = getStoredHashes(site['name']) + for content in contentList: + + contenthash = hashlib.md5(content.content.encode(content.encoding)).hexdigest() + if contenthash not in fileHashes: + if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: sessionHashes.append(contenthash) + changedContents.append(content) - i += 1 + subject = '[' + site['name'] + '] ' + content.title + print(' ' + subject) + if config.enableMailNotifications and len(fileHashes) > 0: + sendAsHtml = (content.contenttype == 'html') + sendmail(receiver=receiver, subject=subject, content=content.content, sendAsHtml=sendAsHtml, link=content.uri, encoding=content.encoding) + mailsSent = mailsSent + 1 + if config.enableRSSFeed: + feedXML.xpath('//channel')[0].append(genFeedItem(subject, content.content, content.uri, len(changedContents))) + else: + sessionHashes.append(contenthash) + + if 'postRun' in site: + runParsers(site['postRun'], changedContents) - if changes > 0: - storeHashes(site['shortname'], sessionHashes) - print(' ' + str(changes) + ' updates') + if len(changedContents) > 0: + storeHashes(site['name'], sessionHashes) + print(' ' + str(len(changedContents)) + ' updates') # store feed if config.enableRSSFeed: for o in feedXML.xpath('//channel/item[position()