From 76ea31d1747d8d95ec7ac75be750176beb452f66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?=
Date: Sun, 6 Aug 2017 19:52:14 +0200
Subject: New upstream version 1.7.6

---
 README.md          |  3 +++
 config_template.py |  3 ++-
 mwc.py             | 23 +++++++++++++++++++----
 3 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index d008527..8e78da6 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,8 @@ sites = [
    Regular expression. If XPath/CSS selector is defined, the regular expression is applied afterwards.
  * encoding (optional; default: 'utf-8')
    Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'.
+ * splitregex (optional)
+   Only works if type is set to 'text'; splits the content into chunks at matches of the given regular expression.
  * receiver (optional)
    Overrides global receiver specification.
  * user-agent (optional)
@@ -79,6 +81,7 @@ sites = [
 
 enableMailNotifications = True   #enable/disable notification messages; if set to False, only send error messages
+maxMailsPerSession = -1   #max. number of mails to send per session; ignored when set to -1
 subjectPostfix = 'A website has been updated!'
 
 sender = 'me@mymail.com'
diff --git a/config_template.py b/config_template.py
index 02f7579..f394e52 100644
--- a/config_template.py
+++ b/config_template.py
@@ -15,7 +15,7 @@ sites = [
            'titleregex': '',
            'contentregex': '',
            'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0',
-           'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+           'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'encoding': 'utf-8'},
 
           {'shortname': 'mywebsite2',
@@ -42,6 +42,7 @@ sites = [
 subjectPostfix = 'A website has been updated!'
 
 enableMailNotifications = True
+maxMailsPerSession = -1
 sender = 'me@mymail.com'
 smtphost = 'mysmtpprovider.com'
 useTLS = True
diff --git a/mwc.py b/mwc.py
index a0635a1..c420a74 100755
--- a/mwc.py
+++ b/mwc.py
@@ -69,6 +69,7 @@ def parseSite(site):
         contenttype = site.get('type', 'html')
         contentregex = site.get('contentregex', '')
         titleregex = site.get('titleregex', '')
+        splitregex = site.get('splitregex', '')
         enc = site.get('encoding', defaultEncoding)
 
         contentxpath = site.get('contentxpath', '')
@@ -96,7 +97,10 @@ def parseSite(site):
 
 
                 if contenttype == 'text' or (contentxpath == '' and titlexpath == ''):
-                        contents = [file.read().decode(enc)]
+                        thefullcontent = file.read().decode(enc)
+                        contents = [thefullcontent]
+                        if splitregex != '':
+                                contents = re.split(splitregex, thefullcontent)  # splitregex is a regex, not a literal separator
                         titles = []
                 else:
                         baseuri = uri
@@ -248,13 +252,13 @@ def getFileContents(shortname):
 
 
 # updates list of content that is stored locally for a specific site
-def storeFileContents(shortname, parseResult):
+def storeFileContents(shortname, contents):
         for f in os.listdir('.'):
                 if f.startswith(shortname + '.') and f.endswith('.txt'):
                         os.remove(f)
 
         i = 0
-        for c in parseResult['contents']:
+        for c in contents:
                 file = open(shortname + '.' + str(i) + '.txt', 'wb')
                 file.write(c.encode('utf-8'))
                 file.close()
@@ -271,7 +275,10 @@ def pollWebsites():
                         feedXML = etree.parse(io.StringIO(emptyfeed))
 
         # start polling sites
+        mailsSent = 0
         for site in config.sites:
+                if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession:
+                        break
 
                 print('polling site [' + site['shortname'] + '] ...')
                 parseResult = parseSite(site)
@@ -283,6 +290,7 @@ def pollWebsites():
                         print('WARNING: ' + parseResult['warning'])
                         if config.enableMailNotifications:
                                 sendmail(receiver, subject, parseResult['warning'], False, None)
+                                mailsSent = mailsSent + 1
                         if config.enableRSSFeed:
                                 feedXML.xpath('//channel')[0].append(genFeedItem(subject, parseResult['warning'], site['uri'], 0))
                 else:
@@ -291,13 +299,19 @@ def pollWebsites():
                         fileContents = getFileContents(site['shortname'])
+                        sessionContents = [c for c in parseResult['contents'] if c in fileContents]  # per site: keep unchanged entries stored
                         i = 0
                         for content in parseResult['contents']:
+                                if config.maxMailsPerSession != -1 and mailsSent >= config.maxMailsPerSession:
+                                        break
+
                                 if content not in fileContents:
                                         changes += 1
+                                        sessionContents.append(content)
 
                                         subject = '[' + site['shortname'] + '] ' + parseResult['titles'][i]
                                         print('    ' + subject)
                                         if config.enableMailNotifications and len(fileContents) > 0:
                                                 sendmail(receiver, subject, content, (site.get('type', 'html') == 'html'), site['uri'])
+                                                mailsSent = mailsSent + 1
 
                                         if config.enableRSSFeed:
                                                 feedXML.xpath('//channel')[0].append(genFeedItem(subject, content, site['uri'], changes))
@@ -305,7 +319,7 @@ def pollWebsites():
 
 
                         if changes > 0:
-                                storeFileContents(site['shortname'], parseResult)
+                                storeFileContents(site['shortname'], sessionContents)
                                 print('        ' + str(changes) + ' updates')
  
         # store feed
@@ -343,6 +357,7 @@ if __name__ == "__main__":
                         if site['shortname'] == dryrun:
                                 parseResult = parseSite(site)
                                 print(parseResult)
+                                print(str(len(parseResult['contents'])) + " results")
                                 break
         else:
                 try:
-- 
cgit v1.2.3