diff options
Diffstat (limited to 'lib/gcstar/GCPlugins/GCbooks/GCAmazon.pm')
-rw-r--r-- | lib/gcstar/GCPlugins/GCbooks/GCAmazon.pm | 352 |
1 files changed, 352 insertions, 0 deletions
package GCPlugins::GCbooks::GCAmazon;

###################################################
#
#  Copyright 2005-2009 Tian
#
#  This file is part of GCstar.
#
#  GCstar is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  GCstar is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with GCstar; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA
#
###################################################

use strict;
use utf8;

use GCPlugins::GCbooks::GCbooksCommon;

{
    package GCPlugins::GCbooks::GCPluginAmazon;

    use base qw(GCPlugins::GCbooks::GCbooksPluginsBase);
    use XML::Simple;
    use LWP::Simple qw($ua);
    use Encode;
    use HTML::Entities;
    use GCUtils;

    # Browse node names whose whole branch is discarded when they show up
    # among a node's ancestors, since they are full of rubbish tags.
    my %IGNORED_ANCESTOR = map { $_ => 1 } (
        'Specialty Stores',
        'Refinements',
        'Self Service',
        'Specialty Boutique',
    );

    # Maps a curInfo field to the ItemAttributes element it is copied from.
    my %ATTRIBUTE_FOR = (
        publisher   => 'Publisher',
        publication => 'PublicationDate',
        pages       => 'NumberOfPages',
        isbn        => 'EAN',
        format      => 'Binding',
        edition     => 'Edition',
    );

    # Parses one response body.
    #
    # When $self->{parsingList} is set, $page is an ItemSearch answer and one
    # entry per item is appended to $self->{itemsList}. Otherwise $page is an
    # ItemLookup answer and $self->{curInfo} is filled in.
    # Nothing is done when $page is empty or is an HTML document.
    sub parse
    {
        my ($self, $page) = @_;

        return if !defined($page) || $page eq '';
        # An HTML document is not an XML answer from the web service
        return if $page =~ /^<!DOCTYPE html/;

        my $xs = XML::Simple->new;
        if ($self->{parsingList})
        {
            my $xml = $xs->XMLin($page, ForceArray => ['Item','Author'], KeyAttr => []);
            $self->_parseSearchResults($xml);
        }
        else
        {
            my $xml = $xs->XMLin($page, ForceArray => ['Author','EditorialReview','Language'], KeyAttr => []);
            $self->_parseItemDetails($xml);
        }
        return;
    }

    # Appends one entry to $self->{itemsList} for every item of a parsed
    # ItemSearch answer.
    sub _parseSearchResults
    {
        my ($self, $xml) = @_;

        my $items = (ref($xml->{Items}) eq 'HASH') ? $xml->{Items}->{Item} : undef;
        # No (usable) result list in the answer
        return if ref($items) ne 'ARRAY';

        foreach my $book (@{$items})
        {
            $self->{itemIdx}++;
            my $attributes = (ref($book->{ItemAttributes}) eq 'HASH') ? $book->{ItemAttributes} : {};
            my $entry = $self->{itemsList}[$self->{itemIdx}] ||= {};

            $entry->{url} = $self->_itemLookupUrl($book->{ASIN});
            $entry->{title} = $attributes->{'Title'};

            my @authors = (ref($attributes->{'Author'}) eq 'ARRAY') ? @{$attributes->{'Author'}} : ();
            $entry->{authors} = join(', ', @authors)
                if @authors;

            $entry->{publication} = $attributes->{'PublicationDate'};
            $entry->{format} = $attributes->{'Binding'};
            $entry->{edition} = $attributes->{'Edition'};
        }
        return;
    }

    # Fills $self->{curInfo} from a parsed ItemLookup answer.
    sub _parseItemDetails
    {
        my ($self, $xml) = @_;

        my $item = (ref($xml->{Items}) eq 'HASH') ? $xml->{Items}->{Item} : undef;
        # Nothing to extract when the answer carries no item
        return if ref($item) ne 'HASH';

        my $attributes = (ref($item->{ItemAttributes}) eq 'HASH') ? $item->{ItemAttributes} : {};
        my $info = $self->{curInfo} ||= {};

        $info->{title} = $attributes->{Title};
        if (ref($attributes->{Author}) eq 'ARRAY')
        {
            push @{$info->{authors}}, [$_] foreach @{$attributes->{Author}};
        }

        my $reviews = (ref($item->{EditorialReviews}) eq 'HASH')
                    ? $item->{EditorialReviews}->{EditorialReview}
                    : undef;
        my $htmlDescription = (ref($reviews) eq 'ARRAY' && ref($reviews->[0]) eq 'HASH')
                            ? $reviews->[0]->{Content}
                            : undef;
        # XML::Simple turns an empty element into a hash ref: that is not a description
        $htmlDescription = undef if ref($htmlDescription);
        if (!$htmlDescription)
        {
            # Unfortunately the api doesn't always return the product description, which is due to
            # copyright concerns or something. In this case, grab the product html and parse it for
            # the description.
            $htmlDescription = $self->_fetchDescriptionFromWeb($item->{DetailPageURL});
        }
        $info->{description} = _htmlToText($htmlDescription);

        # Elements that did not parse to a plain string are skipped
        foreach my $field (keys %ATTRIBUTE_FOR)
        {
            my $value = $attributes->{$ATTRIBUTE_FOR{$field}};
            $info->{$field} = $value
                if !ref($value);
        }

        my $languages = (ref($attributes->{Languages}) eq 'HASH')
                      ? $attributes->{Languages}->{Language}
                      : undef;
        $info->{language} = $languages->[0]->{Name}
            if (ref($languages) eq 'ARRAY') && (ref($languages->[0]) eq 'HASH');

        $info->{web} = $item->{DetailPageURL};

        $self->_collectGenres($item);

        # Fetch either the big original pic, or just the small thumbnail pic
        my $image = $self->{bigPics} ? $item->{LargeImage} : $item->{SmallImage};
        $info->{cover} = (ref($image) eq 'HASH') ? $image->{URL} : undef;
        return;
    }

    # Downloads the product page at $url and returns the html found in its
    # "productDescriptionWrapper" div, or an empty string when the page or
    # the div is not available.
    sub _fetchDescriptionFromWeb
    {
        my ($self, $url) = @_;

        return '' if !$url;

        my $response = $ua->get($url);
        my $result;
        eval {
            $result = $response->decoded_content;
        };
        return '' if !defined($result);

        # Replace some bad characters. TODO - will probably need to extend this for de/jp plugins
        # NOTE(review): as written, the ü/ß/ö/Ü/ä substitutions below replace a character
        # with itself. Presumably the patterns were meant to be HTML entities (e.g. &uuml;),
        # and the typographic characters numeric entities (e.g. &#146;) - confirm upstream.
        $result =~ s|\x{92}|'|gi;
        $result =~ s|’|'|gi;
        $result =~ s|•|*|gi;
        $result =~ s|œ|oe|gi;
        $result =~ s|…|...|gi;
        $result =~ s|\x{85}|...|gi;
        $result =~ s|\x{8C}|OE|gi;
        $result =~ s|\x{9C}|oe|gi;
        $result =~ s|ü|ü|g;
        $result =~ s|ß|ß|g;
        $result =~ s|ö|ö|g;
        $result =~ s|Ü|Ü|g;
        $result =~ s|ä|ä|g;
        $result =~ s/„/»/gm;
        $result =~ s/“/«/gm;

        # Chop out the product description. The capture is only read when the
        # match succeeded, so a stale $1 from an earlier match is never used.
        my $htmlDescription = '';
        if ($result =~ /<div class="productDescriptionWrapper">(.*?)<\/*div/s)
        {
            $htmlDescription = $1;
        }

        # Decode
        decode_entities($htmlDescription);
        $htmlDescription = decode('ISO-8859-1', $htmlDescription);
        return $htmlDescription;
    }

    # Converts an html fragment to plain text: <br> and <p> become line
    # breaks, every other tag is removed, the result is trimmed and runs of
    # spaces are collapsed. Returns an empty string for an undefined input.
    sub _htmlToText
    {
        my ($html) = @_;

        return '' if !defined($html);

        # Also handle the <br /> and <p class="..."> forms
        $html =~ s/<br\s*\/?>/\n/ig;
        $html =~ s/<p(?:\s[^>]*)?>/\n\n/ig;
        # A negated class, unlike '.', also strips tags that span several lines
        $html =~ s/<[^>]*>//g;
        $html =~ s/^\s*//;
        $html =~ s/\s*$//;
        $html =~ s/ {2,}/ /g;
        return $html;
    }

    # Returns the parent of a browse node, or nothing when it has none.
    sub _parentBrowseNode
    {
        my ($node) = @_;

        return if (ref($node) ne 'HASH') || (ref($node->{Ancestors}) ne 'HASH');
        my $parent = $node->{Ancestors}->{BrowseNode};
        # Several ancestors on the same level are parsed as an array: follow the first one
        $parent = $parent->[0] if ref($parent) eq 'ARRAY';
        return $parent if ref($parent) eq 'HASH';
        return;
    }

    # Genre handling via Amazon's browsenodes. Stupidly complicated way of doing things, IMO
    # Walks every browse node of $item up through its ancestors and stores the
    # collected names in $self->{curInfo}->{genre}, as a list of single-element
    # array refs sorted by name and without duplicates.
    sub _collectGenres
    {
        my ($self, $item) = @_;

        my $info = $self->{curInfo} ||= {};

        # BrowseNode is a hash ref for a single node, and an array ref for several of them
        my $browseNodes = (ref($item->{BrowseNodes}) eq 'HASH') ? $item->{BrowseNodes}->{BrowseNode} : undef;
        my @nodes = (ref($browseNodes) eq 'ARRAY') ? @{$browseNodes}
                  : (ref($browseNodes) eq 'HASH')  ? ($browseNodes)
                  :                                  ();

        # Genres are stored as [name], so names have to be unwrapped before comparing them
        my $nameOf = sub {
            my ($entry) = @_;
            return (ref($entry) eq 'ARRAY') ? $entry->[0] : $entry;
        };

        my @genres = (ref($info->{genre}) eq 'ARRAY') ? @{$info->{genre}} : ();
        my %known = map { $_ => 1 } grep { defined($_) } map { $nameOf->($_) } @genres;

        NODE:
        for my $node (@nodes)
        {
            next NODE if ref($node) ne 'HASH';

            # Push the lowest node to the temporary genre list
            my @genre_list = defined($node->{Name}) ? ($node->{Name}) : ();

            # Start stepping up through the current node to find its ancestors
            my $ancestor = $node;
            while (my $parent = _parentBrowseNode($ancestor))
            {
                $ancestor = $parent;
                my $name = defined($ancestor->{Name}) ? $ancestor->{Name} : '';

                # Some categories we definitely want to ignore, since they are full of rubbish tags
                next NODE if $IGNORED_ANCESTOR{$name};

                if ($name =~ m/A\-Z/)
                {
                    # Clear out the current genres from the node, will be full of author specific
                    # rubbish. Keep looping afterwards though, since there could be valid tags
                    # further up than the author specific ones
                    @genre_list = ();
                }
                elsif ($name eq 'Subjects')
                {
                    # Don't go further than a Subjects node
                    last;
                }
                elsif (($name ne '') && !$known{$name} && !(grep { $_ eq $name } @genre_list))
                {
                    # Add the current node to the temporary list, if it's not already included in either list
                    push @genre_list, $name;
                }
            }

            # Add temporary list to item info
            foreach my $name (@genre_list)
            {
                next if $known{$name};
                $known{$name} = 1;
                push @genres, [$name];
            }
        }

        # Let's sort the list for good measure (on the names, not on the references)
        my $sortKey = sub {
            my $name = $nameOf->($_[0]);
            return defined($name) ? $name : '';
        };
        @genres = sort { $sortKey->($a) cmp $sortKey->($b) } @genres;
        $info->{genre} = \@genres;
        return;
    }

    # Constructor. Declares which fields are available in the results list.
    sub new
    {
        my $proto = shift;
        my $class = ref($proto) || $proto;
        my $self = $class->SUPER::new();
        bless ($self, $class);

        $self->{hasField} = {
            title => 1,
            authors => 1,
            publication => 1,
            format => 1,
            edition => 1,
        };

        return $self;
    }

    # Returns the web service url of the ItemLookup request for $asin.
    sub _itemLookupUrl
    {
        my ($self, $asin) = @_;

        $asin = '' if !defined($asin);
        return $self->baseAWSUrl."&Operation=ItemLookup&ResponseGroup=Large,EditorialReview&ItemId=".$asin;
    }

    # Returns the url to fetch for an item.
    #
    # Without a url, a hint on the kind of addresses handled by this plugin is
    # returned. A web service url is returned as is. An amazon product page
    # url (.../dp/<ASIN>...) is converted to its web service lookup url. Any
    # other url is returned unchanged, since no item can be identified from it.
    sub getItemUrl
    {
        my ($self, $url) = @_;
        if (!$url)
        {
            # If we're not passed a url, return a hint so that gcstar knows what type
            # of addresses this plugin handles
            $url = "http://".$self->baseWWWamazonUrl();
        }
        elsif ($url !~ m/sowacs\.appspot\.com/)
        {
            # Convert amazon url to aws url. The identifier ends at the first non word
            # character ('/', '?', '%3F', ...) or at the end of the url
            if ($url =~ m{/dp/(\w+)})
            {
                $url = $self->_itemLookupUrl($1);
            }
        }
        return $url;
    }

    # Returns the page unchanged: no pre-processing is needed.
    sub preProcess
    {
        my ($self, $html) = @_;

        return $html;
    }

    # Returns 0.
    sub decodeEntitiesWanted
    {
        return 0;
    }

    # Returns the web service url of the search for $word on the field
    # selected in $self->{searchField}.
    sub getSearchUrl
    {
        my ($self, $word) = @_;

        my %keyFor = (
            authors => 'Author',
            title   => 'Title',
            isbn    => 'Keywords',
        );
        my $field = defined($self->{searchField}) ? $self->{searchField} : '';
        # Fall back on a keywords search rather than sending a parameter without name
        my $key = $keyFor{$field} || 'Keywords';

        # Only the digits of an isbn are significant
        $word =~ s/\D//g
            if $field eq 'isbn';
        return $self->baseAWSUrl."&Operation=ItemSearch&$key=$word&SearchIndex=Books&ResponseGroup=Medium";
    }

    # Returns the part shared by all the web service urls.
    sub baseAWSUrl
    {
        my $self = shift;
        return "http://sowacs.appspot.com/AWS/%5Bamazon\@gcstar.org%5D".$self->baseAmazonUrl()."/onca/xml?Service=AWSECommerceService&AWSAccessKeyId=AKIAJJ5TJWI62A5OOTQQ&AssociateTag=AKIAJJ5TJWI62A5OOTQQ";
    }

    # Returns the host name of the web service.
    sub baseAmazonUrl
    {
        return "ecs.amazonaws.com";
    }

    # Returns the host name of the web site.
    sub baseWWWamazonUrl
    {
        return "www.amazon.com";
    }

    sub changeUrl
    {
        my ($self, $url) = @_;
        # Make sure the url is for the api, not the main book page
        return $self->getItemUrl($url);
    }

    sub getName
    {
        return "Amazon (US)";
    }

    sub getAuthor
    {
        return 'Zombiepig';
    }

    sub getLang
    {
        return 'EN';
    }

    sub getCharset
    {
        my $self = shift;

        return "UTF-8";
    }

    sub getSearchCharset
    {
        my $self = shift;

        # Need urls to be double character encoded
        return "utf8";
    }

    # Returns the value unchanged.
    sub convertCharset
    {
        my ($self, $value) = @_;
        return $value;
    }

    sub getNotConverted
    {
        my $self = shift;
        return [];
    }

    sub isPreferred
    {
        return 1;
    }

    # Returns the fields a search may be performed on.
    sub getSearchFieldsArray
    {
        return ['title', 'authors', 'isbn'];
    }

}

1;