diff options
-rw-r--r-- | README.md | 3 | ||||
-rw-r--r-- | config_template.py | 3 | ||||
-rwxr-xr-x | mwc.py | 23 |
3 files changed, 24 insertions, 5 deletions
@@ -54,6 +54,8 @@ sites = [ Regular expression. If XPath/CSS selector is defined, the regular expression is applied afterwards. * <b>encoding</b> (optional; default: 'utf-8') Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'. + * <b>splitregex</b> (optional) + Only works if type is set to 'text'; defines that the content should be split into chunks based on the given regular expression. * <b>receiver</b> (optional) Overrides global receiver specification. * <b>user-agent</b> (optional) @@ -79,6 +81,7 @@ sites = [ <pre> <code> enableMailNotifications = True #enable/disable notification messages; if set to False, only send error messages +maxMailsPerSession = -1 #max. number of mails to send per session; ignored when set to -1 subjectPostfix = 'A website has been updated!' sender = 'me@mymail.com' diff --git a/config_template.py b/config_template.py index 02f7579..f394e52 100644 --- a/config_template.py +++ b/config_template.py @@ -15,7 +15,7 @@ sites = [ 'titleregex': '', 'contentregex': '', 'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0', - 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' 'encoding': 'utf-8'}, {'shortname': 'mywebsite2', @@ -42,6 +42,7 @@ sites = [ subjectPostfix = 'A website has been updated!' 
enableMailNotifications = True +maxMailsPerSession = -1 sender = 'me@mymail.com' smtphost = 'mysmtpprovider.com' useTLS = True @@ -69,6 +69,7 @@ def parseSite(site): contenttype = site.get('type', 'html') contentregex = site.get('contentregex', '') titleregex = site.get('titleregex', '') + splitregex = site.get('splitregex', '') enc = site.get('encoding', defaultEncoding) contentxpath = site.get('contentxpath', '') @@ -96,7 +97,10 @@ def parseSite(site): if contenttype == 'text' or (contentxpath == '' and titlexpath == ''): - contents = [file.read().decode(enc)] + thefullcontent = file.read().decode(enc) + contents = [thefullcontent] + if splitregex != '': + contents = thefullcontent.split(splitregex) titles = [] else: baseuri = uri @@ -248,13 +252,13 @@ def getFileContents(shortname): # updates list of content that is stored locally for a specific site -def storeFileContents(shortname, parseResult): +def storeFileContents(shortname, contents): for f in os.listdir('.'): if f.startswith(shortname + '.') and f.endswith('.txt'): os.remove(f) i = 0 - for c in parseResult['contents']: + for c in contents: file = open(shortname + '.' 
+ str(i) + '.txt', 'wb') file.write(c.encode('utf-8')) file.close() @@ -271,7 +275,11 @@ def pollWebsites(): feedXML = etree.parse(io.StringIO(emptyfeed)) # start polling sites + sessionContents = [] + mailsSent = 0 for site in config.sites: + if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession: + break print('polling site [' + site['shortname'] + '] ...') parseResult = parseSite(site) @@ -283,6 +291,7 @@ def pollWebsites(): print('WARNING: ' + parseResult['warning']) if config.enableMailNotifications: sendmail(receiver, subject, parseResult['warning'], False, None) + mailsSent = mailsSent + 1 if config.enableRSSFeed: feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0)) else: @@ -291,13 +300,18 @@ def pollWebsites(): fileContents = getFileContents(site['shortname']) i = 0 for content in parseResult['contents']: + if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession: + break + if content not in fileContents: changes += 1 + sessionContents.append(content) subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i] print(' ' + subject) if config.enableMailNotifications and len(fileContents) > 0: sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri']) + mailsSent = mailsSent + 1 if config.enableRSSFeed: feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes)) @@ -305,7 +319,7 @@ def pollWebsites(): if changes > 0: - storeFileContents(site['shortname'], parseResult) + storeFileContents(site['shortname'], sessionContents) print(' ' + str(changes) + ' updates') # store feed @@ -343,6 +357,7 @@ if __name__ == "__main__": if site['shortname'] == dryrun: parseResult = parseSite(site) print(parseResult) + print(str(len(parseResult['contents'])) + " results") break else: try: |