summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- README.md 3
-rw-r--r-- config_template.py 3
-rwxr-xr-x mwc.py 23
3 files changed, 24 insertions, 5 deletions
diff --git a/README.md b/README.md
index d008527..8e78da6 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,8 @@ sites = [
Regular expression. If XPath/CSS selector is defined, the regular expression is applied afterwards.
* <b>encoding</b> (optional; default: 'utf-8')
Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'.
+ * <b>splitregex</b> (optional)
+ Only works if type is set to 'text'; splits the content into chunks using the given regular expression.
* <b>receiver</b> (optional)
Overrides global receiver specification.
* <b>user-agent</b> (optional)
@@ -79,6 +81,7 @@ sites = [
<pre>
<code>
enableMailNotifications = True #enable/disable notification messages; if set to False, only send error messages
+maxMailsPerSession = -1 #max. number of mails to send per session; ignored when set to -1
subjectPostfix = 'A website has been updated!'
sender = 'me@mymail.com'
diff --git a/config_template.py b/config_template.py
index 02f7579..f394e52 100644
--- a/config_template.py
+++ b/config_template.py
@@ -15,7 +15,7 @@ sites = [
'titleregex': '',
'contentregex': '',
'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0',
- 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+ 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
'encoding': 'utf-8'},
{'shortname': 'mywebsite2',
@@ -42,6 +42,7 @@ sites = [
subjectPostfix = 'A website has been updated!'
enableMailNotifications = True
+maxMailsPerSession = -1
sender = 'me@mymail.com'
smtphost = 'mysmtpprovider.com'
useTLS = True
diff --git a/mwc.py b/mwc.py
index a0635a1..c420a74 100755
--- a/mwc.py
+++ b/mwc.py
@@ -69,6 +69,7 @@ def parseSite(site):
contenttype = site.get('type', 'html')
contentregex = site.get('contentregex', '')
titleregex = site.get('titleregex', '')
+ splitregex = site.get('splitregex', '')
enc = site.get('encoding', defaultEncoding)
contentxpath = site.get('contentxpath', '')
@@ -96,7 +97,10 @@ def parseSite(site):
if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
- contents = [file.read().decode(enc)]
+ thefullcontent = file.read().decode(enc)
+ contents = [thefullcontent]
+ if splitregex != '':
+ contents = thefullcontent.split(splitregex)
titles = []
else:
baseuri = uri
@@ -248,13 +252,13 @@ def getFileContents(shortname):
# updates list of content that is stored locally for a specific site
-def storeFileContents(shortname, parseResult):
+def storeFileContents(shortname, contents):
for f in os.listdir('.'):
if f.startswith(shortname + '.') and f.endswith('.txt'):
os.remove(f)
i = 0
- for c in parseResult['contents']:
+ for c in contents:
file = open(shortname + '.' + str(i) + '.txt', 'wb')
file.write(c.encode('utf-8'))
file.close()
@@ -271,7 +275,11 @@ def pollWebsites():
feedXML = etree.parse(io.StringIO(emptyfeed))
# start polling sites
+ sessionContents = []
+ mailsSent = 0
for site in config.sites:
+ if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession:
+ break
print('polling site [' + site['shortname'] + '] ...')
parseResult = parseSite(site)
@@ -283,6 +291,7 @@ def pollWebsites():
print('WARNING: ' + parseResult['warning'])
if config.enableMailNotifications:
sendmail(receiver, subject, parseResult['warning'], False, None)
+ mailsSent = mailsSent + 1
if config.enableRSSFeed:
feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0))
else:
@@ -291,13 +300,18 @@ def pollWebsites():
fileContents = getFileContents(site['shortname'])
i = 0
for content in parseResult['contents']:
+ if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession:
+ break
+
if content not in fileContents:
changes += 1
+ sessionContents.append(content)
subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i]
print(' ' + subject)
if config.enableMailNotifications and len(fileContents) > 0:
sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri'])
+ mailsSent = mailsSent + 1
if config.enableRSSFeed:
feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes))
@@ -305,7 +319,7 @@ def pollWebsites():
if changes > 0:
- storeFileContents(site['shortname'], parseResult)
+ storeFileContents(site['shortname'], sessionContents)
print(' ' + str(changes) + ' updates')
# store feed
@@ -343,6 +357,7 @@ if __name__ == "__main__":
if site['shortname'] == dryrun:
parseResult = parseSite(site)
print(parseResult)
+ print(str(len(parseResult['contents'])) + " results")
break
else:
try: