package GCPlugins::GCbooks::GCAmazon;

###################################################
#
#  Copyright 2005-2009 Tian
#
#  This file is part of GCstar.
#
#  GCstar is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  GCstar is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with GCstar; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
#
###################################################

use strict;
use utf8;

use GCPlugins::GCbooks::GCbooksCommon;

{
    # Book plugin that queries Amazon (US) through the AWS XML proxy named in
    # baseAWSUrl() and fills either the search-result list or the detail
    # record of the current item.
    package GCPlugins::GCbooks::GCPluginAmazon;

    use base qw(GCPlugins::GCbooks::GCbooksPluginsBase);
    use XML::Simple;
    use LWP::Simple qw($ua);
    use Encode;
    use HTML::Entities;
    use GCUtils;

    # Query-string fragment shared by every ItemLookup request built here.
    my $ITEM_LOOKUP_QUERY =
        "&Operation=ItemLookup&ResponseGroup=Large,EditorialReview&ItemId=";

    # Browse-node ancestors whose whole branch is dropped because it is full
    # of rubbish tags rather than genres.
    my %IGNORED_GENRE_ROOTS = map { $_ => 1 }
        ('Specialty Stores', 'Refinements', 'Self Service', 'Specialty Boutique');

    # Entry point called with the raw response body.
    #   $page - text of the response (expected to be AWS XML).
    # Fills $self->{itemsList} when $self->{parsingList} is set, otherwise
    # fills $self->{curInfo}. Returns nothing.
    sub parse {
        my ($self, $page) = @_;

        # NOTE(review): the reviewed text had lost everything between "<" and
        # "->" on this statement (it read "/^new;"). The HTML guard and the
        # declarations below are reconstructed - confirm against the
        # repository copy.
        return if $page =~ /^<!DOCTYPE html/;

        my $xs = XML::Simple->new;

        if ($self->{parsingList}) {
            my $xml = $xs->XMLin($page, ForceArray => ['Item', 'Author'], KeyAttr => []);
            $self->_parseSearchResults($xml);
        }
        else {
            my $xml = $xs->XMLin($page,
                ForceArray => ['Author', 'EditorialReview', 'Language'],
                KeyAttr    => []);
            $self->_parseItemDetails($xml);
        }

        return;
    }

    # Appends one entry to $self->{itemsList} per <Item> of a search response.
    sub _parseSearchResults {
        my ($self, $xml) = @_;

        my $items = ref($xml->{Items}) eq 'HASH' ? $xml->{Items}->{Item} : undef;

        for my $book (@{ $items || [] }) {
            my $attributes = $book->{ItemAttributes} || {};

            $self->{itemIdx}++;
            my $entry = ($self->{itemsList}[ $self->{itemIdx} ] ||= {});

            $entry->{url}   = $self->baseAWSUrl() . $ITEM_LOOKUP_QUERY . $book->{ASIN};
            $entry->{title} = $attributes->{Title};

            my @authors = @{ $attributes->{Author} || [] };
            $entry->{authors} = join(', ', @authors) if @authors;

            $entry->{publication} = $attributes->{PublicationDate};
            $entry->{format}      = $attributes->{Binding};
            $entry->{edition}     = $attributes->{Edition};
        }

        return;
    }

    # Copies the fields of a single-item lookup response into $self->{curInfo}.
    sub _parseItemDetails {
        my ($self, $xml) = @_;

        my $item       = (ref($xml->{Items}) eq 'HASH' && $xml->{Items}->{Item}) || {};
        my $attributes = $item->{ItemAttributes} || {};
        my $info       = ($self->{curInfo} ||= {});

        $info->{title} = $attributes->{Title};
        push @{ $info->{authors} }, [$_] for @{ $attributes->{Author} || [] };

        # Prefer the editorial review shipped with the API answer.
        my $reviews = ref($item->{EditorialReviews}) eq 'HASH'
            ? $item->{EditorialReviews}->{EditorialReview}
            : undef;
        my $description = ref($reviews) eq 'ARRAY' && ref($reviews->[0]) eq 'HASH'
            ? $reviews->[0]->{Content}
            : undef;
        $description = undef if ref($description);

        # Unfortunately the api doesn't always return the product description,
        # so fall back to scraping the product page.
        $description = $self->_fetchPageDescription($item->{DetailPageURL})
            if !$description;

        $info->{description} = _htmlToText($description);

        # Plain-text attributes; XML::Simple hands back a reference when the
        # element is not a simple string, in which case the field is skipped.
        my %attributeFor = (
            publisher   => 'Publisher',
            publication => 'PublicationDate',
            pages       => 'NumberOfPages',
            isbn        => 'EAN',
            format      => 'Binding',
            edition     => 'Edition',
        );
        for my $field (keys %attributeFor) {
            my $value = $attributes->{ $attributeFor{$field} };
            $info->{$field} = $value if !ref($value);
        }

        my $languages = ref($attributes->{Languages}) eq 'HASH'
            ? $attributes->{Languages}->{Language}
            : undef;
        $info->{language} = $languages->[0]->{Name}
            if ref($languages) eq 'ARRAY' && ref($languages->[0]) eq 'HASH';

        $info->{web} = $item->{DetailPageURL};

        $self->_collectGenres($item->{BrowseNodes});

        # Fetch either the big original pic, or just the small thumbnail pic
        my $image = $self->{bigPics} ? $item->{LargeImage} : $item->{SmallImage};
        $info->{cover} = ref($image) eq 'HASH' ? $image->{URL} : undef;

        return;
    }

    # Downloads the product page and extracts the description block from it.
    #   $pageUrl - DetailPageURL of the item.
    # Returns the decoded description, or '' when the page could not be read
    # or holds no description block.
    sub _fetchPageDescription {
        my ($self, $pageUrl) = @_;

        my $response = $ua->get($pageUrl);
        my $result;
        eval { $result = $response->decoded_content; };
        return '' if !defined $result;

        # Replace some bad characters. TODO - will probably need to extend this for de/jp plugins
        $result =~ s|\x{92}|'|gi;
        $result =~ s|’|'|gi;
        $result =~ s|•|*|gi;
        $result =~ s|œ|oe|gi;
        $result =~ s|…|...|gi;
        $result =~ s|\x{85}|...|gi;
        $result =~ s|\x{8C}|OE|gi;
        $result =~ s|\x{9C}|oe|gi;
        # NOTE(review): in the reviewed text the next five substitutions map a
        # character to itself; their left-hand sides were presumably mis-encoded
        # sequences - confirm against the repository copy.
        $result =~ s|ü|ü|g;
        $result =~ s|ß|ß|g;
        $result =~ s|ö|ö|g;
        $result =~ s|Ü|Ü|g;
        $result =~ s|ä|ä|g;
        $result =~ s/„/»/gm;
        $result =~ s/“/«/gm;

        # Chop out the product description. The capture is only used when the
        # pattern really matched, instead of reading $1 unconditionally.
        # NOTE(review): the opening tag of this pattern was missing from the
        # reviewed text; the wrapper <div> is reconstructed - confirm.
        my ($description) =
            $result =~ /<div class="productDescriptionWrapper">(.*?)<(\/)*?div/s;
        return '' if !defined $description;

        # Decode
        decode_entities($description);
        return decode('ISO-8859-1', $description);
    }

    # Turns an HTML fragment into plain text: line-break tags become newlines,
    # every other tag is removed, surrounding whitespace is trimmed and runs
    # of spaces are collapsed. Undefined input yields ''.
    sub _htmlToText {
        my ($html) = @_;
        $html = '' if !defined $html;

        # NOTE(review): the two tag patterns below were empty in the reviewed
        # text; <br> and <p> are inferred from their replacements - confirm.
        $html =~ s/<br>/\n/ig;
        $html =~ s/<p>/\n\n/ig;
        $html =~ s/<(.*?)>//gi;
        $html =~ s/^\s*//;
        $html =~ s/\s*$//;
        $html =~ s/ {1,}/ /g;

        return $html;
    }

    # Returns $value when it is a defined plain scalar, '' otherwise.
    sub _plainText {
        my ($value) = @_;
        return (defined($value) && !ref($value)) ? $value : '';
    }

    # Genre handling via Amazon's browsenodes: every node contributes its own
    # name plus the names of its ancestors.
    #   $browseNodes - the <BrowseNodes> structure of the item (may be undef).
    # Leaves $self->{curInfo}->{genre} as a list of [name] entries, free of
    # duplicates and sorted by name.
    sub _collectGenres {
        my ($self, $browseNodes) = @_;
        my $info = ($self->{curInfo} ||= {});

        # BrowseNode is not forced to an array, so one node arrives as a hash.
        my $raw = ref($browseNodes) eq 'HASH' ? $browseNodes->{BrowseNode} : undef;
        my @nodes = ref($raw) eq 'ARRAY' ? @{$raw}
                  : ref($raw) eq 'HASH'  ? ($raw)
                  :                        ();

        # Names already stored. The stored entries are [name] array refs, so
        # they are unwrapped before being used as keys.
        my %seen = map { (ref($_) eq 'ARRAY' ? $_->[0] : $_) => 1 }
                   grep { defined }
                   @{ $info->{genre} || [] };

        for my $node (grep { ref($_) eq 'HASH' } @nodes) {
            # Start with the lowest node, then climb through its ancestors.
            my @names    = (_plainText($node->{Name}));
            my $ignored  = 0;
            my $ancestor = $node;

            while (1) {
                my $parents = $ancestor->{Ancestors};
                last if ref($parents) ne 'HASH';

                my $parent = $parents->{BrowseNode};
                $parent = $parent->[0] if ref($parent) eq 'ARRAY';
                last if ref($parent) ne 'HASH';

                $ancestor = $parent;
                my $name = _plainText($ancestor->{Name});

                if ($IGNORED_GENRE_ROOTS{$name}) {
                    # Some categories we definitely want to ignore
                    $ignored = 1;
                    last;
                }
                elsif ($name =~ m/A\-Z/) {
                    # Everything gathered so far sits below an alphabetical
                    # index node; drop it but keep climbing, since there
                    # could be valid tags further up.
                    @names = ();
                }
                elsif ($name eq 'Subjects') {
                    # Don't go further than a Subjects node
                    last;
                }
                else {
                    push @names, $name;
                }
            }

            next if $ignored;

            for my $name (@names) {
                next if $name eq '' || $seen{$name}++;
                push @{ $info->{genre} }, [$name];
            }
        }

        # Sort by genre name (the entries themselves are array refs).
        $info->{genre} = [ sort { $a->[0] cmp $b->[0] } @{ $info->{genre} || [] } ];

        return;
    }

    # Constructor: builds the base plugin object and declares which fields
    # the search-result list provides.
    sub new {
        my $proto = shift;
        my $class = ref($proto) || $proto;
        my $self  = $class->SUPER::new();
        bless($self, $class);

        $self->{hasField} = {
            title       => 1,
            authors     => 1,
            publication => 1,
            format      => 1,
            edition     => 1,
        };

        return $self;
    }

    # Maps an Amazon product URL to the matching API lookup URL.
    #   $url - product page URL, an API URL, or empty.
    # Returns the site address as a hint when $url is empty, $url itself when
    # it already targets the API proxy, and an ItemLookup URL otherwise.
    sub getItemUrl {
        my ($self, $url) = @_;

        # If we're not passed a url, return a hint so that gcstar knows what
        # type of addresses this plugin handles
        return "http://" . $self->baseWWWamazonUrl() if !$url;

        # Already an api url: nothing to convert
        return $url if index($url, 'sowacs.appspot.com') >= 0;

        # The ASIN is the whole run of word characters after /dp/ (or
        # /gp/product/), whatever follows it: "/", "?", "%3F" or end of string.
        my ($asin) = $url =~ m{/(?:dp|gp/product)/(\w+)};
        $asin = '' if !defined $asin;

        return $self->baseAWSUrl() . $ITEM_LOOKUP_QUERY . $asin;
    }

    # Hook applied to the downloaded text before parsing; nothing to change.
    sub preProcess {
        my ($self, $html) = @_;
        return $html;
    }

    # Reports that entities should not be decoded before parse() is called.
    sub decodeEntitiesWanted {
        return 0;
    }

    # Builds the ItemSearch URL for a query.
    #   $word - the search text.
    # The request parameter follows $self->{searchField}: Author, Title, or
    # Keywords for an ISBN (and for any other field).
    sub getSearchUrl {
        my ($self, $word) = @_;

        my %parameterFor = (
            authors => 'Author',
            title   => 'Title',
            isbn    => 'Keywords',
        );
        my $field = defined $self->{searchField} ? $self->{searchField} : '';
        my $key   = $parameterFor{$field} || 'Keywords';

        # Strip separators from an ISBN but keep the "X" check digit that an
        # ISBN-10 may end with.
        $word =~ s/[^0-9Xx]//g if $field eq 'isbn';

        return $self->baseAWSUrl()
             . "&Operation=ItemSearch&$key=$word&SearchIndex=Books&ResponseGroup=Medium";
    }

    # Base of every API request: proxy address, API host and credentials.
    sub baseAWSUrl {
        my $self = shift;
        return "http://sowacs.appspot.com/AWS/%5Bamazon\@gcstar.org%5D"
             . $self->baseAmazonUrl()
             . "/onca/xml?Service=AWSECommerceService&AWSAccessKeyId=AKIAJJ5TJWI62A5OOTQQ&AssociateTag=AKIAJJ5TJWI62A5OOTQQ";
    }

    # Host name of the API endpoint.
    sub baseAmazonUrl {
        return "ecs.amazonaws.com";
    }

    # Host name of the regular web site.
    sub baseWWWamazonUrl {
        return "www.amazon.com";
    }

    # Rewrites a url before it is fetched.
    sub changeUrl {
        my ($self, $url) = @_;
        # Make sure the url is for the api, not the main book page
        return $self->getItemUrl($url);
    }

    sub getName {
        return "Amazon (US)";
    }

    sub getAuthor {
        return 'Zombiepig';
    }

    sub getLang {
        return 'EN';
    }

    sub getCharset {
        my $self = shift;
        return "UTF-8";
    }

    sub getSearchCharset {
        my $self = shift;
        # Need urls to be double character encoded
        return "utf8";
    }

    # Values are returned untouched.
    sub convertCharset {
        my ($self, $value) = @_;
        return $value;
    }

    sub getNotConverted {
        my $self = shift;
        return [];
    }

    sub isPreferred {
        return 1;
    }

    # Fields this plugin can search on.
    sub getSearchFieldsArray {
        return ['title', 'authors', 'isbn'];
    }
}

1;