From b329a335ad10648e64b3de049c083bc04638a072 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Fri, 19 Sep 2014 08:48:17 +0200 Subject: Imported Upstream version 1.7.2 --- README.md | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 README.md (limited to 'README.md') diff --git a/README.md b/README.md new file mode 100644 index 0000000..7419ad2 --- /dev/null +++ b/README.md @@ -0,0 +1,125 @@ +# MailWebsiteChanges + +Python script to keep track of website changes; sends email notifications on updates and/or also provides an RSS feed + +To specify which parts of a website should be monitored, XPath selectors (e.g. "//h1"), CSS selectors (e.g. "h1"), and regular expressions can be used (just choose the tools you like!). + +MailWebsiteChanges is related to PageMonitor for Chrome and AlertBox / Check4Change for Firefox. However, instead of living in your web browser, you can run it independently from command line / bash and install it as a simple cron job running on your linux server. + + +This is Open Source -- so please contribute eagerly! ;-) + + +## Configuration +Configuration can be done by creating a config.py file (just place this file in the program folder): +Some examples: + +### Website definitions +
+
+sites = [
+
+          {'shortname': 'mywebsite1',
+           'uri': 'http://www.mywebsite1.com/info',
+           'contentcss': 'div'},
+
+          {'shortname': 'mywebsite2',
+           'uri': 'http://www.mywebsite2.com/info',
+           'contentxpath': '//*[contains(concat(\' \', normalize-space(@class), \' \'), \' news-list-container \')]',
+           'titlexpath': '//title'},
+
+          {'shortname': 'mywebsite3',
+           'uri': 'http://www.mywebsite3.com/info',
+           'type': 'text',
+           'contentregex': 'Version\"\:\d*\.\d*'}
+
+]
+
+
+ + * parameters: + + * shortname + short name of the entry, used as an identifier when sending email notifications + * uri + URI of the website; if the scheme of the uri is 'cmd://', the string is interpreted as a command and the standard output (stdout) is parsed. + * type (optional; default: 'html') + content type, e.g., 'xml'/'html'/'text'. + * contentxpath / titlexpath (optional) + XPath expression for the content/title sections to extract. If you prefer, you could use contentcss/titlecss instead. + * contentcss / titlecss (optional) + CSS expression for the content/title sections to extract. This is ignored if there is a corresponding XPath definition. + * contentregex / titleregex (optional) + Regular expression. If an XPath/CSS selector is defined, the regular expression is applied afterwards. + * encoding (optional; default: 'utf-8') + Character encoding of the website, e.g., 'utf-8' or 'iso-8859-1'. + * receiver (optional) + Overrides the global receiver specification. + + + * We collect some XPath/CSS snippets at this place: Snippet collection - please feel free to add your own definitions! + + * The --dry-run="shortname" option might be useful in order to validate and fine-tune a definition. + + * If you would like to keep the data stored in a different place than the working directory, you can include something like this: +
+   
+  os.chdir('/path/to/data/directory')
+   
+  
+ +### Mail settings +
+
+enableMailNotifications = True   #enable/disable notification messages; if set to False, only send error messages
+subjectPostfix = 'A website has been updated!'
+
+sender = 'me@mymail.com'
+smtphost = 'mysmtpprovider.com'
+useTLS = True
+smtpport = 587
+smtpusername = sender
+smtppwd = 'mypassword'
+receiver = 'me2@mymail.com'   # set to '' to also disable notifications in case of errors (not recommended)
+
+
+ + +### RSS Feeds +If you prefer to use the RSS feature, you just have to specify the path of the feed file which should be generated by the script (e.g., rssfile = 'feed.xml') and then point your webserver to that file. You can also invoke the mwcfeedserver.py script which implements a very basic webserver. + +
+ 
+enableRSSFeed = True   #enable/disable RSS feed
+
+rssfile = 'feed.xml'
+maxFeeds = 100
+ 
+
+ + +### Program execution +To set up a job that periodically runs the script, simply append something like this to your /etc/crontab: +
+ 
+0 8-22/2    * * *   root	/usr/bin/python3 /usr/bin/mwc
+ 
+
+This will run the script every two hours between 8am and 10pm. + +If you prefer invoking the script with an alternate configuration file, simply pass the name of the configuration file as an argument, e.g., for my_alternate_config.py, use mwc --config=my_alternate_config. + + +## Requirements +Requires Python 3, lxml, and cssselect. +For Ubuntu 12.04, type: + + * sudo apt-get install python3 python3-dev python3-setuptools libxml2 libxslt1.1 libxml2-dev libxslt1-dev python-libxml2 python-libxslt1 + * sudo easy\_install3 pip + * sudo pip-3.2 install lxml cssselect + +For Ubuntu 14.04, type: + + * sudo apt-get install python3-lxml python3-pip + * sudo pip3 install cssselect + -- cgit v1.2.3