diff options
-rw-r--r-- | .gitignore | 1 | ||||
-rwxr-xr-x | mwc.py | 574 |
2 files changed, 287 insertions, 288 deletions
@@ -4,5 +4,4 @@ /*.txt *~ *.pyc -.pc @@ -1,4 +1,5 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- # Copyright: (2013-2015) Michael Till Beck <Debianguru@gmx.de> # License: GPL-2.0+ @@ -9,6 +10,7 @@ from lxml import etree from cssselect import GenericTranslator import re import io +import hashlib import smtplib from email.mime.text import MIMEText @@ -51,324 +53,322 @@ mailsession = None # translates all relative URIs found in trees to absolute URIs def toAbsoluteURIs(trees, baseuri): - for tree in trees: - if isinstance(tree, str): - continue - for uriAttribute in uriAttributes: - tags = tree.xpath(uriAttribute[0]) - for tag in tags: - if tag.attrib.get(uriAttribute[1]) != None: - if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '': - tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]]) + for tree in trees: + if isinstance(tree, str): + continue + for uriAttribute in uriAttributes: + tags = tree.xpath(uriAttribute[0]) + for tag in tags: + if tag.attrib.get(uriAttribute[1]) != None: + if urllib.parse.urlparse(tag.attrib[uriAttribute[1]]).scheme == '': + tag.attrib[uriAttribute[1]] = urllib.parse.urljoin(baseuri, tag.attrib[uriAttribute[1]]) def parseSite(site): - file, content, titles, warning = None, None, None, None - - uri = site['uri'] - contenttype = site.get('type', 'html') - contentregex = site.get('contentregex', '') - titleregex = site.get('titleregex', '') - splitregex = site.get('splitregex', '') - enc = site.get('encoding', defaultEncoding) - - contentxpath = site.get('contentxpath', '') - if contentxpath == '' and site.get('contentcss', '') != '': - # CSS - contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss')) - titlexpath = site.get('titlexpath', '') - if titlexpath == '' and site.get('titlecss', '') != '': - titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss')) - - try: - - if uri.startswith(cmdscheme): - # run command and retrieve output - process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True) - file = process.stdout - else: - # open website - req = urllib.request.Request(uri) - if 'user-agent' in site: - req.add_header('User-Agent', site['user-agent']) - if 'accept' in site: - req.add_header('Accept', site['accept']) - file = urllib.request.urlopen(req) - - - if contenttype == 'text' or (contentxpath == '' and titlexpath == ''): - thefullcontent = file.read().decode(enc) - contents = [thefullcontent] - if splitregex != '': - contents = thefullcontent.split(splitregex) - titles = [] - else: - baseuri = uri - if contenttype == 'html': - parser = etree.HTMLParser(encoding=enc) - else: - parser = etree.XMLParser(recover=True, encoding=enc) - - tree = etree.parse(file, parser) - - # xpath - contentresult = tree.xpath(contentxpath) if contentxpath else [] - titleresult = tree.xpath(titlexpath) if titlexpath else [] - - # translate relative URIs to absolute URIs - if contenttype == 'html': - basetaglist = tree.xpath('/html/head/base') - if len(basetaglist) != 0: - baseuri = basetaglist[0].attrib['href'] - if len(contentresult) != 0: - toAbsoluteURIs(contentresult, baseuri) - if len(titleresult) != 0: - toAbsoluteURIs(titleresult, baseuri) - - if contentxpath != '' and titlexpath != '' and len(contentresult) != len(titleresult): - warning = 'WARNING: number of title blocks (' + str(len(titleresult)) + ') does not match number of content blocks (' + str(len(contentresult)) + ')' - elif contentxpath and len(contentresult) == 0: - warning = 'WARNING: content selector became invalid!' - elif titlexpath and len(titleresult) == 0: - warning = 'WARNING: title selector became invalid!' - else: - if len(contentresult) == 0: - contentresult = titleresult - if len(titleresult) == 0: - titleresult = contentresult - - if isinstance(contentresult, str): - contents = [contentresult] - else: - contents = [etree.tostring(s, encoding=defaultEncoding, pretty_print=True).decode(defaultEncoding) for s in contentresult] - if isinstance(titleresult, str): - titles = [getSubject(titleresult)] - else: - titles = [getSubject(' '.join(s.xpath('.//text()'))) for s in titleresult] - - except IOError as e: - warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e) - - if file is not None: - file.close() - - if uri.startswith(cmdscheme) and process.wait() != 0: - warning = 'WARNING: process terminated with an error' - - if warning: - return {'content': content, 'titles': titles, 'warning': warning} - - # parse regex - if contentregex: - contents = [x for y in [re.findall(r'' + contentregex, c, re.S) for c in contents] for x in y] - if titleregex: - titles = [x for y in [re.findall(r'' + titleregex, c, re.S) for c in titles] for x in y] - - if contentregex and titleregex and len(contents) != len(titles): - warning = 'WARNING: number of title blocks (' + str(len(titles)) + ') does not match number of content blocks (' + str(len(contents)) + ') after regex' - elif contentregex and len(contents) == 0: - warning = 'WARNING: content regex became invalid!' - elif titleregex and len(titles) == 0: - warning = 'WARNING: title regex became invalid!' + global defaultEncoding + file, content, titles, warning = None, None, None, None + + uri = site['uri'] + contenttype = site.get('type', 'html') + contentregex = site.get('contentregex', '') + titleregex = site.get('titleregex', '') + splitregex = site.get('splitregex', '') + enc = site.get('encoding', defaultEncoding) + + contentxpath = site.get('contentxpath', '') + if contentxpath == '' and site.get('contentcss', '') != '': + # CSS + contentxpath = GenericTranslator().css_to_xpath(site.get('contentcss')) + titlexpath = site.get('titlexpath', '') + if titlexpath == '' and site.get('titlecss', '') != '': + titlexpath = GenericTranslator().css_to_xpath(site.get('titlecss')) + + try: + + if uri.startswith(cmdscheme): + # run command and retrieve output + process = subprocess.Popen(uri[len(cmdscheme):], stdout=subprocess.PIPE, shell=True, close_fds=True) + file = process.stdout else: - if len(contents) == 0: - contents = titles - if len(titles) == 0: - titles = [getSubject(c) for c in contents] - - return {'contents': contents, 'titles': titles, 'warning': warning} + # open website + req = urllib.request.Request(uri) + if 'user-agent' in site: + req.add_header('User-Agent', site['user-agent']) + if 'accept' in site: + req.add_header('Accept', site['accept']) + file = urllib.request.urlopen(req) + + + if contenttype == 'text' or (contentxpath == '' and titlexpath == ''): + thefullcontent = file.read().decode(enc, errors='ignore') + contents = [thefullcontent] + if splitregex != '': + contents = thefullcontent.split(splitregex) + titles = [] + else: + baseuri = uri + if contenttype == 'html': + parser = etree.HTMLParser(encoding=enc) + else: + parser = etree.XMLParser(recover=True, encoding=enc) + + tree = etree.parse(file, parser) + + # xpath + contentresult = tree.xpath(contentxpath) if contentxpath else [] + titleresult = tree.xpath(titlexpath) if titlexpath else [] + + # translate relative URIs to absolute URIs + if contenttype == 'html': + basetaglist = tree.xpath('/html/head/base') + if len(basetaglist) != 0: + baseuri = basetaglist[0].attrib['href'] + if len(contentresult) != 0: + toAbsoluteURIs(contentresult, baseuri) + if len(titleresult) != 0: + toAbsoluteURIs(titleresult, baseuri) + + if contentxpath != '' and titlexpath != '' and len(contentresult) != len(titleresult): + warning = 'WARNING: number of title blocks (' + str(len(titleresult)) + ') does not match number of content blocks (' + str(len(contentresult)) + ')' + elif contentxpath and len(contentresult) == 0: + warning = 'WARNING: content selector became invalid!' + elif titlexpath and len(titleresult) == 0: + warning = 'WARNING: title selector became invalid!' + else: + if len(contentresult) == 0: + contentresult = titleresult + if len(titleresult) == 0: + titleresult = contentresult + + if isinstance(contentresult, str): + contents = [contentresult] + else: + contents = [etree.tostring(s, encoding=enc, pretty_print=True).decode(enc, errors='ignore') for s in contentresult] + if isinstance(titleresult, str): + titles = [getSubject(titleresult)] + else: + titles = [getSubject(etree.tostring(s, method='text', encoding=enc).decode(enc, errors='ignore')) for s in titleresult] + + except IOError as e: + warning = 'WARNING: could not open URL; maybe content was moved?\n\n' + str(e) + + if file is not None: + file.close() + + if uri.startswith(cmdscheme) and process.wait() != 0: + warning = 'WARNING: process terminated with an error' + + if warning: + return {'content': content, 'titles': titles, 'warning': warning} + + # parse regex + if contentregex: + contents = [x for y in [re.findall(r'' + contentregex, c, re.S) for c in contents] for x in y] + if titleregex: + titles = [x for y in [re.findall(r'' + titleregex, c, re.S) for c in titles] for x in y] + + if contentregex and titleregex and len(contents) != len(titles): + warning = 'WARNING: number of title blocks (' + str(len(titles)) + ') does not match number of content blocks (' + str(len(contents)) + ') after regex' + elif contentregex and len(contents) == 0: + warning = 'WARNING: content regex became invalid!' + elif titleregex and len(titles) == 0: + warning = 'WARNING: title regex became invalid!' + else: + if len(contents) == 0: + contents = titles + if len(titles) == 0: + titles = [getSubject(c) for c in contents] + + return {'contents': contents, 'titles': titles, 'warning': warning} # returns a short subject line def getSubject(textContent): - if textContent == None or textContent == '': - return config.subjectPostfix - textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip() - return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent + if textContent == None or textContent == '': + return config.subjectPostfix + textContent = re.sub(' +', ' ', re.sub('\s', ' ', textContent)).strip() + return (textContent[:maxTitleLength] + ' [..]') if len(textContent) > maxTitleLength else textContent # generates a new RSS feed item def genFeedItem(subject, content, link, change): - feeditem = etree.Element('item') - titleitem = etree.Element('title') - titleitem.text = subject + ' #' + str(change) - feeditem.append(titleitem) - linkitem = etree.Element('link') - linkitem.text = link - feeditem.append(linkitem) - descriptionitem = etree.Element('description') - descriptionitem.text = content - feeditem.append(descriptionitem) - guiditem = etree.Element('guid') - guiditem.text = str(random.getrandbits(32)) - feeditem.append(guiditem) - dateitem = etree.Element('pubDate') - dateitem.text = strftime("%a, %d %b %Y %H:%M:%S %Z", time.localtime()) - feeditem.append(dateitem) - - return feeditem + feeditem = etree.Element('item') + titleitem = etree.Element('title') + titleitem.text = subject + ' #' + str(change) + feeditem.append(titleitem) + linkitem = etree.Element('link') + linkitem.text = link + feeditem.append(linkitem) + descriptionitem = etree.Element('description') + descriptionitem.text = content + feeditem.append(descriptionitem) + guiditem = etree.Element('guid') + guiditem.text = str(random.getrandbits(32)) + feeditem.append(guiditem) + dateitem = etree.Element('pubDate') + dateitem.text = strftime("%a, %d %b %Y %H:%M:%S %Z", time.localtime()) + feeditem.append(dateitem) + + return feeditem # sends mail notification def sendmail(receiver, subject, content, sendAsHtml, link): - global mailsession - - if sendAsHtml: - baseurl = None - if link != None: - content = '<p><a href="' + link + '">' + subject + '</a></p>\n' + content - baseurl = urljoin(link, '/') - mail = MIMEText('<html><head><title>' + subject + '</title>' + ('<base href="' + baseurl + '">' if baseurl else '') + '</head><body>' + content + '</body></html>', 'html', defaultEncoding) - else: - if link != None: - content = link + '\n\n' + content - mail = MIMEText(content, 'text', defaultEncoding) + global mailsession, defaultEncoding + + if sendAsHtml: + baseurl = None + if link != None: + content = '<p><a href="' + link + '">' + subject + '</a></p>\n' + content + baseurl = urljoin(link, '/') + mail = MIMEText('<html><head><title>' + subject + '</title>' + ('<base href="' + baseurl + '">' if baseurl else '') + '</head><body>' + content + '</body></html>', 'html', defaultEncoding) + else: + if link != None: + content = link + '\n\n' + content + mail = MIMEText(content, 'text', defaultEncoding) + + mail['From'] = config.sender + mail['To'] = receiver + mail['Subject'] = Header(subject, defaultEncoding) + + # initialize session once, not each time this method gets called + if mailsession is None: + mailsession = smtplib.SMTP(config.smtphost, config.smtpport) + if config.useTLS: + mailsession.ehlo() + mailsession.starttls() + if config.smtpusername is not None: + mailsession.login(config.smtpusername, config.smtppwd) + + mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) - mail['From'] = config.sender - mail['To'] = receiver - mail['Subject'] = Header(subject, defaultEncoding) - # initialize session once, not each time this method gets called - if mailsession is None: - mailsession = smtplib.SMTP(config.smtphost, config.smtpport) - if config.useTLS: - mailsession.ehlo() - mailsession.starttls() - if config.smtpusername is not None: - mailsession.login(config.smtpusername, config.smtppwd) - - mailsession.sendmail(config.sender, receiver.split(','), mail.as_string()) +# returns a list of all content that is stored locally for a specific site +def getStoredHashes(shortname): + result = [] + filename = shortname + ".txt" + if os.path.exists(filename): + with open(filename, 'r') as file: + for line in file: + result.append(line.rstrip()) -# returns a list of all content that is stored locally for a specific site -def getFileContents(shortname): - result = [] - for f in os.listdir('.'): - if f.startswith(shortname + '.') and f.endswith('.txt'): - file = open(f, 'rb') - result.append(file.read().decode('utf-8')) - file.close() - return result + return result # updates list of content that is stored locally for a specific site -def storeFileContents(shortname, contents): - for f in os.listdir('.'): - if f.startswith(shortname + '.') and f.endswith('.txt'): - os.remove(f) - - i = 0 - for c in contents: - file = open(shortname + '.' + str(i) + '.txt', 'wb') - file.write(c.encode('utf-8')) - file.close() - i += 1 +def storeHashes(shortname, contentHashes): + with open(shortname + '.txt', 'w') as file: + for h in contentHashes: + file.write(h + "\n") -def pollWebsites(): - # parse existing feed or create a new one - if config.enableRSSFeed: - if os.path.isfile(config.rssfile): - feedXML = etree.parse(config.rssfile) - else: - feedXML = etree.parse(io.StringIO(emptyfeed)) +def pollWebsites(): + global defaultEncoding - # start polling sites - sessionContents = [] - mailsSent = 0 - for site in config.sites: - if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession: - break + # parse existing feed or create a new one + if config.enableRSSFeed: + if os.path.isfile(config.rssfile): + feedXML = etree.parse(config.rssfile) + else: + feedXML = etree.parse(io.StringIO(emptyfeed)) + + # start polling sites + mailsSent = 0 + for site in config.sites: + print('polling site [' + site['shortname'] + '] ...') + sessionHashes = [] + parseResult = parseSite(site) + receiver = site.get('receiver', config.receiver) + + # if something went wrong, notify the user + if parseResult['warning']: + subject = '[' + site['shortname'] + '] WARNING' + print('WARNING: ' + parseResult['warning']) + if config.enableMailNotifications: + if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: + sendmail(receiver, subject, parseResult['warning'], False, None) + mailsSent = mailsSent + 1 + if config.enableRSSFeed: + feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0)) + else: + # otherwise, check which parts of the site were updated + changes = 0 + fileHashes = getStoredHashes(site['shortname']) + i = 0 + for content in parseResult['contents']: + + contenthash = hashlib.md5(content.encode(defaultEncoding)).hexdigest() + if contenthash not in fileHashes: + if config.maxMailsPerSession == -1 or mailsSent < config.maxMailsPerSession: + changes += 1 + sessionHashes.append(contenthash) + + subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i] + print(' ' + subject) + if config.enableMailNotifications and len(fileHashes) > 0: + sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri']) + mailsSent = mailsSent + 1 - print('polling site [' + site['shortname'] + '] ...') - parseResult = parseSite(site) - receiver = site.get('receiver', config.receiver) - - # if something went wrong, notify the user - if parseResult['warning']: - subject = '[' + site['shortname'] + '] WARNING' - print('WARNING: ' + parseResult['warning']) - if config.enableMailNotifications: - sendmail(receiver, subject, parseResult['warning'], False, None) - mailsSent = mailsSent + 1 if config.enableRSSFeed: - feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0)) + feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes)) else: - # otherwise, check which parts of the site were updated - changes = 0 - fileContents = getFileContents(site['shortname']) - i = 0 - for content in parseResult['contents']: - if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession: - break - - if content not in fileContents: - changes += 1 - sessionContents.append(content) - - subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i] - print(' ' + subject) - if config.enableMailNotifications and len(fileContents) > 0: - sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri']) - mailsSent = mailsSent + 1 - - if config.enableRSSFeed: - feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes)) - i += 1 - - - if changes > 0: - storeFileContents(site['shortname'], sessionContents) - print(' ' + str(changes) + ' updates') - - # store feed - if config.enableRSSFeed: - for o in feedXML.xpath('//channel/item[position()<last()-' + str(config.maxFeeds - 1) + ']'): - o.getparent().remove(o) - file = open(config.rssfile, 'w') - file.write(etree.tostring(feedXML, pretty_print=True, xml_declaration=True, encoding=defaultEncoding).decode(defaultEncoding)) - file.close() + sessionHashes.append(contenthash) + i += 1 -if __name__ == "__main__": - configMod = 'config' - dryrun = None + if changes > 0: + storeHashes(site['shortname'], sessionHashes) + print(' ' + str(changes) + ' updates') - try: - opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 'dry-run=']) - except getopt.GetoptError: - print('Usage: mwc.py --config=config --dry-run=shortname') - sys.exit(1) - for opt, arg in opts: - if opt == '-h': - print('Usage: mwc.py --config=config') - exit() - elif opt in ('-c', '--config'): - configMod = arg - elif opt in ('-d', '--dry-run'): - dryrun = arg - - config = importlib.import_module(configMod) - - if dryrun: - for site in config.sites: - if site['shortname'] == dryrun: - parseResult = parseSite(site) - print(parseResult) - print(str(len(parseResult['contents'])) + " results") - break - else: - try: - pollWebsites() - except: - msg = str(sys.exc_info()[0]) + '\n\n' + traceback.format_exc() - print(msg) - if config.receiver != '': - sendmail(config.receiver, '[mwc] Something went wrong ...', msg, False, None) - - if mailsession: - mailsession.quit() - mailsession = None + # store feed + if config.enableRSSFeed: + for o in feedXML.xpath('//channel/item[position()<last()-' + str(config.maxFeeds - 1) + ']'): + o.getparent().remove(o) + file = open(config.rssfile, 'w') + file.write(etree.tostring(feedXML, pretty_print=True, xml_declaration=True, encoding=defaultEncoding).decode(defaultEncoding, errors='ignore')) + file.close() + + +if __name__ == "__main__": + configMod = 'config' + dryrun = None + + try: + opts, args = getopt.getopt(sys.argv[1:], 'hc:d:', ['help', 'config=', 'dry-run=']) + except getopt.GetoptError: + print('Usage: mwc.py --config=config --dry-run=shortname') + sys.exit(1) + for opt, arg in opts: + if opt == '-h': + print('Usage: mwc.py --config=config') + exit() + elif opt in ('-c', '--config'): + configMod = arg + elif opt in ('-d', '--dry-run'): + dryrun = arg + + config = importlib.import_module(configMod) + + if dryrun: + for site in config.sites: + if site['shortname'] == dryrun: + parseResult = parseSite(site) + print(parseResult) + print(str(len(parseResult['contents'])) + " results") + break + else: + try: + pollWebsites() + except: + msg = str(sys.exc_info()[0]) + '\n\n' + traceback.format_exc() + print(msg) + if config.receiver != '': + sendmail(config.receiver, '[mwc] Something went wrong ...', msg, False, None) + + if mailsession: + mailsession.quit() + mailsession = None |