diff options
author | Jörg Frings-Fürst <jff@merkur> | 2014-07-06 15:20:38 +0200 |
---|---|---|
committer | Jörg Frings-Fürst <jff@merkur> | 2014-07-06 15:20:38 +0200 |
commit | 126bb8cb6b93240bb4d3a2b816b74c286c3d422b (patch) | |
tree | e66e1dfe77d53a52539489765c88d23e4423ae27 /lib/gcstar/GCPlugins/GCcomics/GCcomicbookdb.pm |
Imported Upstream version 1.7.0 upstream/1.7.0
Diffstat (limited to 'lib/gcstar/GCPlugins/GCcomics/GCcomicbookdb.pm')
-rw-r--r-- | lib/gcstar/GCPlugins/GCcomics/GCcomicbookdb.pm | 546 |
1 files changed, 546 insertions, 0 deletions
package GCPlugins::GCcomics::GCcomicbookdb;

###################################################
#
#  Copyright 2005-2012 Christian Jodar
#
#  This file is part of GCstar.
#
#  GCstar is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  GCstar is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with GCstar; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
#
###################################################

use strict;
use utf8;

use GCPlugins::GCcomics::GCcomicsCommon;

{

    # Plugin scraping comicbookdb.com in two passes (see getNumberPasses):
    # pass 1 collects series links from the search results, the following
    # pass collects the issues of a series; item pages are then parsed for
    # the details stored in $self->{curInfo}.
    #
    # The start/end/text subs take the arguments of per-tag parser
    # callbacks; presumably they are driven by the HTML parser of
    # GCcomicsPluginsBase -- confirm against the base class.
    #
    # All parsing state lives in flags on $self. Most of them are left
    # undefined until first set, and are compared while still undefined.
    package GCPlugins::GCcomics::GCPlugincomicbookdb;

    use LWP::Simple qw($ua);
    use HTTP::Cookies;

    use base qw(GCPlugins::GCcomics::GCcomicsPluginsBase);

    # Root of the site (with trailing slash); hrefs scraped from the pages
    # are appended to it.
    use constant SITE_ROOT => "http://www.comicbookdb.com/";

    # Opening-tag callback. Dispatches on the current pass and on whether
    # a list or a single item page is being parsed.
    #   $tagname - name of the opened tag
    #   $attr    - hash ref of the tag's attributes
    #   $attrseq, $origtext - accepted for interface compatibility, unused
    sub start
    {
        my ($self, $tagname, $attr, $attrseq, $origtext) = @_;

        if ($self->{pass} == 1)
        {
            # First pass, searching for series name
            $self->_startSeriesSearch($tagname, $attr);
        }
        elsif ($self->{parsingList})
        {
            # Second pass, parsing issue list
            $self->_startIssueList($tagname, $attr);
        }
        else
        {
            # Fetching item info
            $self->_startItemInfo($tagname, $attr);
        }
    }

    # Pass 1: every link seen after an <h2> (and before the closing </td>
    # that clears isAtResults) becomes a series entry, except eBay links.
    sub _startSeriesSearch
    {
        my ($self, $tagname, $attr) = @_;

        if ($tagname eq "h2")
        {
            $self->{isAtResults} = 1;
        }

        if (   ($tagname eq "a")
            && ($self->{isAtResults})
            && ($attr->{href} !~ m/ebay\.com/))
        {
            # The link text (series name) is stored by text()
            $self->{isCollection} = 1;
            $self->{itemIdx}++;
            $self->{itemsList}[ $self->{itemIdx} ]->{nextUrl} =
              SITE_ROOT . $attr->{href};
        }
    }

    # Appends a new entry to the results list, pointing at the given href.
    sub _addIssue
    {
        my ($self, $href) = @_;

        $self->{itemIdx}++;
        $self->{itemsList}[ $self->{itemIdx} ]->{url} = SITE_ROOT . $href;
    }

    # Issue list: links of the results table are mapped to entries.
    # isSpecialIssue is 0 for a plain row, 1 on the row announcing an issue
    # with several editions (javascript link), 2 inside the <tbody> that
    # follows and lists those editions.
    sub _startIssueList
    {
        my ($self, $tagname, $attr) = @_;

        if (   ($tagname eq "tbody")
            && ($self->{isResultsTable})
            && ($self->{isSpecialIssue} == 1))
        {
            $self->{isSpecialIssue} = 2;
        }

        return unless (($tagname eq "a") && ($self->{isResultsTable}));

        if ($attr->{href} =~ m/javascript/)
        {
            # Multiple editions of the one issue, need to be
            # handled differently
            $self->{isSpecialIssue} = 1;
        }
        elsif ($attr->{href} =~ m/storyarc\.php/)
        {
            # Prevent story arcs from populating lists
        }
        elsif ($self->{isSpecialIssue} == 1)
        {
            $self->{resultsTableColumn}++;
            if ($self->{resultsTableColumn} == 1)
            {
                # Issue number shared by all the editions
                $self->{isSpecialIssueNo} = 1;
                $self->{isIssue}          = 1;
                $self->_addIssue($attr->{href});
            }
            elsif ($self->{resultsTableColumn} == 2)
            {
                # Title shared by all the editions
                $self->{isTitle}        = 1;
                $self->{isSpecialTitle} = 1;
            }
        }
        elsif ($self->{isSpecialIssue} == 2)
        {
            # One entry per edition; text() completes it with the
            # shared issue number and title
            $self->_addIssue($attr->{href});
            $self->{isTitle} = 1;
        }
        else
        {
            $self->{resultsTableColumn}++;
            if ($self->{resultsTableColumn} == 1)
            {
                $self->{isIssue} = 1;
                $self->_addIssue($attr->{href});
            }
            elsif ($self->{resultsTableColumn} == 2)
            {
                $self->{isTitle} = 1;
            }
        }
    }

    # Sets the three flags "<prefix>Writers", "<prefix>Pencillers" and
    # "<prefix>Colorists" so that only $selected is 1; pass "" to clear
    # them all.
    sub _setExclusive
    {
        my ($self, $prefix, $selected) = @_;

        foreach my $section (qw(Writers Pencillers Colorists))
        {
            $self->{ $prefix . $section } = ($section eq $selected) ? 1 : 0;
        }
    }

    # Item page: raises the flag telling text() what the next text chunk
    # is, and picks up the cover image URL.
    sub _startItemInfo
    {
        my ($self, $tagname, $attr) = @_;

        my $isLink = ($tagname eq "a");

        if (   ($tagname eq "span")
            && ((index $attr->{class}, "page_headline") > -1))
        {
            $self->{insideHeadline} = 1;
        }
        elsif ($isLink
            && ($self->{insideHeadline})
            && ($attr->{href} =~ m/title\.php/))
        {
            $self->{insideName} = 1;
        }
        elsif ($isLink
            && ($self->{insideHeadline})
            && ($attr->{href} =~ m/issue_number\.php/))
        {
            $self->{insideNumber} = 1;
        }
        elsif ($isLink && ($self->{nextisWriters}))
        {
            $self->_setExclusive("inside", "Writers");
        }
        elsif ($isLink && ($self->{nextisPencillers}))
        {
            $self->_setExclusive("inside", "Pencillers");
        }
        elsif ($isLink && ($self->{nextisColorists}))
        {
            $self->_setExclusive("inside", "Colorists");
        }
        elsif ($isLink && ($attr->{href} =~ /imprint\.php/))
        {
            $self->{insidePublisher} = 1;
        }
        elsif ($isLink
            && ($attr->{href} =~ /publisher\.php/)
            && (!$self->{curInfo}->{publisher}))
        {
            # Publisher link only used when no publisher is known yet
            $self->{insidePublisher} = 1;
        }
        elsif ($isLink && ($attr->{href} =~ /coverdate\.php/))
        {
            $self->{insideCoverDate} = 1;
        }

        # Independent of the chain above: sub headline and cover image
        if (   ($tagname eq "span")
            && ((index $attr->{class}, "test") > -1)
            && ((index $attr->{class}, "page_subheadline") > -1))
        {
            $self->{insideSubHeadline} = 1;
        }
        elsif ($isLink
            && ($attr->{href} =~ /^graphics\/comic_graphics\//))
        {
            # A linked graphic always replaces the current image
            $self->{curInfo}->{image} = SITE_ROOT . $attr->{href};
        }
        elsif (($tagname eq "img")
            && ($attr->{src} =~ /^graphics\/comic_graphics\//)
            && (!$self->{curInfo}->{image}))
        {
            # An inline graphic is only a fallback
            $self->{curInfo}->{image} = SITE_ROOT . $attr->{src};
        }
    }

    # Closing-tag callback: lowers the flags whose scope ends with the tag.
    sub end
    {
        my ($self, $tagname) = @_;
        $self->{inside}->{$tagname}--;

        if ($self->{isResultsTable})
        {
            if ($tagname eq "table")
            {
                $self->{isResultsTable} = 0;
            }
            elsif ($tagname eq "tr")
            {
                # Columns are counted per row
                $self->{resultsTableColumn} = 0;
            }
        }

        if ($tagname eq "tbody")
        {
            $self->{isSpecialIssue} = 0;
        }
        elsif ($tagname eq "span")
        {
            $self->{insideHeadline}    = 0;
            $self->{insideSubHeadline} = 0;
            $self->{insideNumber}      = 0;
        }
        elsif ($tagname eq "td")
        {
            $self->{isAtResults} = 0;
            $self->_setExclusive("nextis", "");
            $self->_setExclusive("inside", "");
        }
        elsif ($tagname eq "a")
        {
            $self->{insidePublisher} = 0;
            $self->{insideCoverDate} = 0;
        }
    }

    # Text callback: stores $origtext where the flags raised by start()
    # say it belongs. Single-space chunks are ignored, and so is everything
    # once parsingEnded is set.
    sub text
    {
        my ($self, $origtext) = @_;

        return if ($origtext eq " ");

        return if ($self->{parsingEnded});

        if ($self->{parsingList})
        {
            $self->_textInList($origtext);
        }
        else
        {
            $self->_textInItem($origtext);
        }
    }

    # Text of a results list (series search or issue list).
    sub _textInList
    {
        my ($self, $origtext) = @_;

        if ($self->{isCollection})
        {
            $self->{itemsList}[ $self->{itemIdx} ]->{series} = $origtext;
            $self->{isCollection} = 0;
        }
        if ($origtext eq "Cover Date")
        {
            # Column header announcing the table of issues
            $self->{isResultsTable} = 1;
        }
        if ($self->{isIssue})
        {
            $self->{itemsList}[ $self->{itemIdx} ]->{volume} = $origtext;
            $self->{isIssue} = 0;
        }
        if ($self->{isSpecialIssueNo})
        {
            # Remembered for the editions listed afterwards
            $self->{specialIssueNo}   = $origtext;
            $self->{isSpecialIssueNo} = 0;
        }
        if ($self->{isTitle})
        {
            my $entry = $self->{itemsList}[ $self->{itemIdx} ];
            if ($self->{isSpecialIssue} == 2)
            {
                # Edition of a multi-edition issue: shared number, and
                # shared title followed by the edition's own text
                $entry->{volume} = $self->{specialIssueNo};
                $entry->{title}  = $self->{specialTitle} . $origtext;
            }
            else
            {
                $entry->{title} = $origtext;
            }
            $self->{isTitle} = 0;
        }
        if ($self->{isSpecialTitle})
        {
            $self->{specialTitle}   = $origtext;
            $self->{isSpecialTitle} = 0;
        }
    }

    # Adds $name to the comma separated list $self->{curInfo}->{$field},
    # unless it is already one of the listed names. Candidates are compared
    # as whole entries, so a name which is merely contained in a longer,
    # already listed one is still added.
    sub _appendCredit
    {
        my ($self, $field, $name) = @_;

        my $listed = $self->{curInfo}->{$field};
        if (!defined($listed) || ($listed eq ""))
        {
            $self->{curInfo}->{$field} = $name;
        }
        elsif ((index ", " . $listed . ", ", ", " . $name . ", ") == -1)
        {
            $self->{curInfo}->{$field} .= ", " . $name;
        }
    }

    # Text of an item page. Only the first matching case applies, so the
    # order of the tests matters.
    sub _textInItem
    {
        my ($self, $origtext) = @_;

        my $info = $self->{curInfo};

        if ($self->{insideName})
        {
            $info->{series} = $origtext;
            #$info->{series} =~ s/(\s\([0-9]*\))$//;
            $self->{insideName} = 0;
        }
        elsif (($self->{insideNumber}) && ($origtext =~ /^\s*#(\d+)/))
        {
            # volume where #XX is in <A HREF... tag, '-' is not
            $info->{volume} = $1;
            $self->{insideNumber} = 0;
        }
        elsif (($self->{insideHeadline}) && ($origtext =~ /-\s#(\d+)/))
        {
            # volume where #XX isn't in <A HREF... tag
            $info->{volume} = $1;
            $self->{insideNumber} = 0;
        }
        elsif (($self->{insideHeadline}) && ($origtext =~ /-\s*TPB/))
        {
            # Trade paperback
            $info->{series} .= " TPB";

            # Get volume number. Default to 1.
            if ($origtext =~ /vol\. (\d+)/)
            {
                $info->{volume} = $1;
            }
            else
            {
                $info->{volume} = 1;
            }
            $self->{insideNumber} = 0;
        }
        elsif (($self->{insideHeadline}) && ($origtext =~ /vol\. (\d+)/))
        {
            $info->{volume} = $1;
            $self->{insideNumber} = 0;
        }
        elsif (($self->{insideHeadline})
            && ($origtext =~ /-\s*Annual\s*(\d+)/))
        {
            # Annual volume where #XX isn't in <A HREF... tag
            $info->{volume} = $1;
            $info->{series} .= " Annual";
            $self->{insideNumber} = 0;
        }
        elsif (($self->{insideSubHeadline}) && ($origtext =~ /\"(.*)\"/))
        {
            $info->{title} = $1;

            # Get printing or other note if present
            if ($origtext =~ /\((.*)\)/)
            {
                $info->{title} .= " (" . $1 . ")";
            }
        }
        elsif ($self->{insidePublisher})
        {
            $info->{publisher} = $origtext;
            $self->{insidePublisher} = 0;
        }
        elsif ($origtext eq "Writer(s):")
        {
            $self->_setExclusive("nextis", "Writers");
        }
        elsif ($origtext eq "Penciller(s):")
        {
            $self->_setExclusive("nextis", "Pencillers");
        }
        elsif ($origtext eq "Colorist(s):")
        {
            $self->_setExclusive("nextis", "Colorists");
        }
        elsif (($origtext eq "Letterer(s):")
            || ($origtext eq "Inker(s):")
            || ($origtext eq "Editor(s):")
            || ($origtext eq "Cover Artist(s):")
            || ($origtext eq "Characters:")
            || ($origtext eq "Groups:"))
        {
            # Sections which are not imported
            $self->_setExclusive("nextis", "");
        }
        elsif ($self->{insideWriters})
        {
            $self->_appendCredit("writer", $origtext);
            $self->{insideWriters} = 0;
        }
        elsif ($self->{insidePencillers})
        {
            $self->_appendCredit("illustrator", $origtext);
            $self->{insidePencillers} = 0;
        }
        elsif ($self->{insideColorists})
        {
            $self->_appendCredit("colourist", $origtext);
            $self->{insideColorists} = 0;
        }
        elsif ($origtext eq "Synopsis: ")
        {
            $self->{nextisSynopsis} = 1;
        }
        elsif ($self->{nextisSynopsis})
        {
            # "None entered." is the site's placeholder for no synopsis
            if ($origtext !~ /None entered\./)
            {
                $info->{synopsis} = $origtext;
                $info->{synopsis} =~ s/^(\s)*//;
                $info->{synopsis} =~ s/(\s)*$//;
            }
            $self->{nextisSynopsis} = 0;
        }
        elsif ($self->{insideCoverDate})
        {
            $info->{printdate} = $origtext;
            $info->{printdate} =~ s/^(\s)*//;

            # Translate date string to date
            $info->{printdate} =
              GCUtils::strToTime($info->{printdate}, "%B %Y");
            $info->{publishdate} = $info->{printdate};
        }
    }

    # Constructor: builds the object through the parent class, gives its
    # user agent a fresh cookie jar and initialises the parsing state.
    sub new
    {
        my $proto = shift;
        my $class = ref($proto) || $proto;
        my $self  = $class->SUPER::new();

        $self->{ua}->cookie_jar(HTTP::Cookies->new);

        bless($self, $class);

        $self->{isResultsTable}     = 0;
        $self->{itemIdx}            = 0;
        $self->{resultsTableColumn} = 0;
        $self->{curName}            = undef;
        $self->{curUrl}             = undef;

        return $self;
    }

    # Declares in $self->{hasField} the fields filled in for each entry of
    # the results list: the series in pass 1, title and volume afterwards.
    sub getReturnedFields
    {
        my $self = shift;

        if ($self->{pass} == 1)
        {
            $self->{hasField} = {series => 1,};
        }
        else
        {
            $self->{hasField} = {
                title  => 1,
                volume => 1,
            };
        }
    }

    # Resets the parsingEnded flag and returns the page unchanged.
    sub preProcess
    {
        my ($self, $html) = @_;

        $self->{parsingEnded} = 0;

        return $html;
    }

    # Returns the title search URL for $word (expected already URL
    # encoded, as shown by the pattern below).
    sub getSearchUrl
    {
        my ($self, $word) = @_;

        $word =~ s/\+%28\d{4}%29$//;    # strip year from end of $word (title)

        # Grab the home page first, or the pages fetched are blank
        # (who knows why... must be something funky with the website).
        # Only the request matters, the response is not used.
        $ua->get(SITE_ROOT);

        return SITE_ROOT
          . "search.php?form_search=$word&form_searchtype=Title";
    }

    # Returns the absolute URL of an item: absolute http/https URLs are
    # returned untouched, anything else is resolved against the site root,
    # with or without a leading slash.
    sub getItemUrl
    {
        my ($self, $url) = @_;
        return $url if $url =~ /^https?:/;

        # SITE_ROOT already ends with a slash
        $url =~ s{^/}{};
        return SITE_ROOT . $url;
    }

    # Number of passes needed to reach an item (series, then issues).
    sub getNumberPasses
    {
        return 2;
    }

    # Display name of the plugin.
    sub getName
    {
        return "Comic Book DB";
    }

    # Author of the plugin.
    sub getAuthor
    {
        return 'Zombiepig';
    }

    # Language of the site's data.
    sub getLang
    {
        return 'EN';
    }
}

1;