package GCPlugins::GCcomics::GCcomicbookdb;

###################################################
#
#  Copyright 2005-2012 Christian Jodar
#
#  This file is part of GCstar.
#
#  GCstar is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  GCstar is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with GCstar; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
#
###################################################

use strict;
use utf8;

use GCPlugins::GCcomics::GCcomicsCommon;

{

    package GCPlugins::GCcomics::GCPlugincomicbookdb;

    use LWP::Simple qw($ua);
    use HTTP::Cookies;

    use base qw(GCPlugins::GCcomics::GCcomicsPluginsBase);

    # Root of the site; every relative link found in a page is appended to it.
    use constant BASE_URL => 'http://www.comicbookdb.com/';

    # Opening-tag handler (presumably invoked by the HTML parser of the base
    # class - confirm against GCcomicsPluginsBase).
    #
    #   $tagname  lower-case tag name
    #   $attr     hash ref of the tag's attributes
    #   $attrseq, $origtext  accepted for interface compatibility, unused here
    #
    # Depending on the current pass and on $self->{parsingList} it either
    # collects series links (pass 1), collects issue links from the results
    # table, or raises the "inside*" flags that tell text() which field the
    # next text chunk belongs to.
    sub start {
        my ($self, $tagname, $attr, $attrseq, $origtext) = @_;

        # An <a> without href (e.g. a named anchor) is not a link; treating
        # it as one would create items whose URL is just the site root.
        my $isLink = ($tagname eq "a") && defined($attr->{href});
        my $href     = $isLink                  ? $attr->{href}  : '';
        my $cssClass = defined($attr->{class}) ? $attr->{class} : '';

        if ($self->{pass} == 1) {
            # First pass, searching for series name
            if ($tagname eq "h2") {
                $self->{isAtResults} = 1;
            }
            if ($isLink && $self->{isAtResults} && ($href !~ m/ebay\.com/)) {
                $self->{isCollection} = 1;
                $self->{itemIdx}++;
                $self->{itemsList}[ $self->{itemIdx} ]->{nextUrl} = BASE_URL . $href;
            }
        }
        elsif ($self->{parsingList}) {
            # Second pass: parsing the issue list of a series
            if (   ($tagname eq "tbody")
                && $self->{isResultsTable}
                && ($self->{isSpecialIssue} == 1))
            {
                # The <tbody> following a "javascript" link holds the
                # individual editions of that issue.
                $self->{isSpecialIssue} = 2;
            }

            if ($isLink && $self->{isResultsTable}) {
                if ($href =~ m/javascript/) {
                    # Multiple editions of the one issue, need to be
                    # handled differently
                    $self->{isSpecialIssue} = 1;
                }
                elsif ($href =~ m/storyarc\.php/) {
                    # Prevent story arcs from populating lists
                }
                elsif ($self->{isSpecialIssue} == 1) {
                    # Header row of a multi-edition issue: remember its
                    # number and title so each edition can reuse them.
                    $self->{resultsTableColumn}++;
                    if ($self->{resultsTableColumn} == 1) {
                        $self->{isSpecialIssueNo} = 1;
                        $self->{isIssue}          = 1;
                        $self->{itemIdx}++;
                        $self->{itemsList}[ $self->{itemIdx} ]->{url} = BASE_URL . $href;
                    }
                    elsif ($self->{resultsTableColumn} == 2) {
                        $self->{isTitle}        = 1;
                        $self->{isSpecialTitle} = 1;
                    }
                }
                elsif ($self->{isSpecialIssue} == 2) {
                    # One edition of a multi-edition issue
                    $self->{itemIdx}++;
                    $self->{itemsList}[ $self->{itemIdx} ]->{url} = BASE_URL . $href;
                    $self->{isTitle} = 1;
                }
                else {
                    # Regular issue row: first link is the issue number,
                    # second link is the title.
                    $self->{resultsTableColumn}++;
                    if ($self->{resultsTableColumn} == 1) {
                        $self->{isIssue} = 1;
                        $self->{itemIdx}++;
                        $self->{itemsList}[ $self->{itemIdx} ]->{url} = BASE_URL . $href;
                    }
                    elsif ($self->{resultsTableColumn} == 2) {
                        $self->{isTitle} = 1;
                    }
                }
            }
        }
        else {
            # Fetching item info
            if (($tagname eq "span") && (index($cssClass, "page_headline") > -1)) {
                $self->{insideHeadline} = 1;
            }
            elsif (($tagname eq "a") && $self->{insideHeadline} && ($href =~ m/title\.php/)) {
                $self->{insideName} = 1;
            }
            elsif (($tagname eq "a") && $self->{insideHeadline} && ($href =~ m/issue_number\.php/)) {
                $self->{insideNumber} = 1;
            }
            elsif (($tagname eq "a") && $self->{nextisWriters}) {
                $self->{insideWriters}    = 1;
                $self->{insidePencillers} = 0;
                $self->{insideColorists}  = 0;
            }
            elsif (($tagname eq "a") && $self->{nextisPencillers}) {
                $self->{insideWriters}    = 0;
                $self->{insidePencillers} = 1;
                $self->{insideColorists}  = 0;
            }
            elsif (($tagname eq "a") && $self->{nextisColorists}) {
                $self->{insideWriters}    = 0;
                $self->{insidePencillers} = 0;
                $self->{insideColorists}  = 1;
            }
            elsif (($tagname eq "a") && ($href =~ m/imprint\.php/)) {
                $self->{insidePublisher} = 1;
            }
            elsif (($tagname eq "a")
                && ($href =~ m/publisher\.php/)
                && (!$self->{curInfo}->{publisher}))
            {
                # Only used when no publisher has been stored yet
                $self->{insidePublisher} = 1;
            }
            elsif (($tagname eq "a") && ($href =~ m/coverdate\.php/)) {
                $self->{insideCoverDate} = 1;
            }

            # Independent of the chain above: sub-headline and cover image.
            if (   ($tagname eq "span")
                && (index($cssClass, "test") > -1)
                && (index($cssClass, "page_subheadline") > -1))
            {
                $self->{insideSubHeadline} = 1;
            }
            elsif (($tagname eq "a") && ($href =~ m/^graphics\/comic_graphics\//)) {
                # Link to the cover picture
                $self->{curInfo}->{image} = BASE_URL . $href;
            }
            elsif ($tagname eq "img") {
                my $src = defined($attr->{src}) ? $attr->{src} : '';
                # Fall back on the displayed picture when no link was seen
                if (($src =~ m/^graphics\/comic_graphics\//) && (!$self->{curInfo}->{image})) {
                    $self->{curInfo}->{image} = BASE_URL . $src;
                }
            }
        }
    }

    # Closing-tag handler: lowers the state flags raised by start() and
    # text() once the element they describe has ended.
    sub end {
        my ($self, $tagname) = @_;

        $self->{inside}->{$tagname}--;

        if ($self->{isResultsTable}) {
            if ($tagname eq "table") {
                $self->{isResultsTable} = 0;
            }
            elsif ($tagname eq "tr") {
                # New row: restart the link counter
                $self->{resultsTableColumn} = 0;
            }
        }

        if ($tagname eq "tbody") {
            $self->{isSpecialIssue} = 0;
        }
        elsif ($tagname eq "span") {
            $self->{insideHeadline}    = 0;
            $self->{insideSubHeadline} = 0;
            $self->{insideNumber}      = 0;
        }
        elsif ($tagname eq "td") {
            $self->{isAtResults}      = 0;
            $self->{nextisWriters}    = 0;
            $self->{nextisPencillers} = 0;
            $self->{nextisColorists}  = 0;
            $self->{insideWriters}    = 0;
            $self->{insidePencillers} = 0;
            $self->{insideColorists}  = 0;
        }
        elsif ($tagname eq "a") {
            $self->{insidePublisher} = 0;
            $self->{insideCoverDate} = 0;
        }
    }

    # Adds $name to the comma separated credit list stored in
    # $self->{curInfo}->{$field}, unless it is already one of its entries.
    # Entries are compared as whole ", "-delimited items, so a name that is
    # merely a substring of another credited name is still added.
    sub _appendCredit {
        my ($self, $field, $name) = @_;

        return if !defined($name) || ($name eq "");

        my $current = $self->{curInfo}->{$field};
        if (!defined($current) || ($current eq "")) {
            $self->{curInfo}->{$field} = $name;
        }
        elsif ($current !~ m/(?:\A|, )\Q$name\E(?:, |\z)/) {
            $self->{curInfo}->{$field} .= ", " . $name;
        }
    }

    # Text handler: stores $origtext in the field selected by the flags set
    # in start(), or recognises labels ("Writer(s):", "Synopsis: ", ...)
    # announcing what the following links/text contain.
    sub text {
        my ($self, $origtext) = @_;

        return if ($origtext eq " ");
        return if ($self->{parsingEnded});

        if ($self->{parsingList}) {
            if ($self->{isCollection}) {
                $self->{itemsList}[ $self->{itemIdx} ]->{series} = $origtext;
                $self->{isCollection} = 0;
            }
            if ($origtext eq "Cover Date") {
                # Column header marking the start of the issues table
                $self->{isResultsTable} = 1;
            }
            if ($self->{isIssue}) {
                $self->{itemsList}[ $self->{itemIdx} ]->{volume} = $origtext;
                $self->{isIssue} = 0;
            }
            if ($self->{isSpecialIssueNo}) {
                $self->{specialIssueNo}   = $origtext;
                $self->{isSpecialIssueNo} = 0;
            }
            if ($self->{isTitle}) {
                if ($self->{isSpecialIssue} == 2) {
                    # Edition of a multi-edition issue: reuse the number and
                    # title remembered from the header row.
                    $self->{itemsList}[ $self->{itemIdx} ]->{volume} = $self->{specialIssueNo};
                    $self->{itemsList}[ $self->{itemIdx} ]->{title} =
                      $self->{specialTitle} . $origtext;
                }
                else {
                    $self->{itemsList}[ $self->{itemIdx} ]->{title} = $origtext;
                }
                $self->{isTitle} = 0;
            }
            if ($self->{isSpecialTitle}) {
                $self->{specialTitle}   = $origtext;
                $self->{isSpecialTitle} = 0;
            }
        }
        else {
            if ($self->{insideName}) {
                $self->{curInfo}->{series} = $origtext;
                # Disabled: stripping of a trailing "(year)" from the series
                #$self->{curInfo}->{series} =~ s/(\s\([0-9]*\))$//;
                $self->{insideName} = 0;
            }
            elsif ($self->{insideNumber} && ($origtext =~ /^\s*#(\d+)/)) {
                # volume where #XX is in <a>
                $self->{curInfo}->{volume} = $1;
                $self->{insideNumber} = 0;
            }
            elsif ($self->{insideHeadline} && ($origtext =~ /-\s#(\d+)/)) {
                # volume where #XX isn't in <a>
                $self->{curInfo}->{volume} = $1;
                $self->{insideNumber} = 0;
            }
            elsif ($self->{insideHeadline} && ($origtext =~ /-\s*TPB/)) {
                # Trade paperback
                $self->{curInfo}->{series} .= " TPB";

                # Get volume number. Default to 1.
                if ($origtext =~ /vol\. (\d+)/) {
                    $self->{curInfo}->{volume} = $1;
                }
                else {
                    $self->{curInfo}->{volume} = 1;
                }
                $self->{insideNumber} = 0;
            }
            elsif ($self->{insideHeadline} && ($origtext =~ /vol\. (\d+)/)) {
                $self->{curInfo}->{volume} = $1;
                $self->{insideNumber} = 0;
            }
            elsif ($self->{insideHeadline} && ($origtext =~ /-\s*Annual\s*(\d+)/)) {
                # Annual volume where #XX isn't in <a>
                $self->{curInfo}->{volume} = $1;
                $self->{curInfo}->{series} .= " Annual";
                $self->{insideNumber} = 0;
            }
            elsif ($self->{insideSubHeadline} && ($origtext =~ /\"(.*)\"/)) {
                $self->{curInfo}->{title} = $1;

                # Get printing or other note if present
                if ($origtext =~ /\((.*)\)/) {
                    $self->{curInfo}->{title} .= " (" . $1 . ")";
                }
            }
            elsif ($self->{insidePublisher}) {
                $self->{curInfo}->{publisher} = $origtext;
                $self->{insidePublisher} = 0;
            }
            elsif ($origtext eq "Writer(s):") {
                $self->{nextisWriters}    = 1;
                $self->{nextisPencillers} = 0;
                $self->{nextisColorists}  = 0;
            }
            elsif ($origtext eq "Penciller(s):") {
                $self->{nextisWriters}    = 0;
                $self->{nextisPencillers} = 1;
                $self->{nextisColorists}  = 0;
            }
            elsif ($origtext eq "Colorist(s):") {
                $self->{nextisWriters}    = 0;
                $self->{nextisPencillers} = 0;
                $self->{nextisColorists}  = 1;
            }
            elsif (($origtext eq "Letterer(s):")
                || ($origtext eq "Inker(s):")
                || ($origtext eq "Editor(s):")
                || ($origtext eq "Cover Artist(s):")
                || ($origtext eq "Characters:")
                || ($origtext eq "Groups:"))
            {
                # Sections that are not stored: stop collecting credits
                $self->{nextisWriters}    = 0;
                $self->{nextisPencillers} = 0;
                $self->{nextisColorists}  = 0;
            }
            elsif ($self->{insideWriters}) {
                $self->_appendCredit('writer', $origtext);
                $self->{insideWriters} = 0;
            }
            elsif ($self->{insidePencillers}) {
                $self->_appendCredit('illustrator', $origtext);
                $self->{insidePencillers} = 0;
            }
            elsif ($self->{insideColorists}) {
                $self->_appendCredit('colourist', $origtext);
                $self->{insideColorists} = 0;
            }
            elsif ($origtext eq "Synopsis: ") {
                $self->{nextisSynopsis} = 1;
            }
            elsif ($self->{nextisSynopsis}) {
                if ($origtext !~ /None entered./) {
                    $self->{curInfo}->{synopsis} = $origtext;
                    $self->{curInfo}->{synopsis} =~ s/^(\s)*//;
                    $self->{curInfo}->{synopsis} =~ s/(\s)*$//;
                }
                $self->{nextisSynopsis} = 0;
            }
            elsif ($self->{insideCoverDate}) {
                $self->{curInfo}->{printdate} = $origtext;
                $self->{curInfo}->{printdate} =~ s/^(\s)*//;

                # Translate date string to date
                $self->{curInfo}->{printdate} =
                  GCUtils::strToTime($self->{curInfo}->{printdate}, "%B %Y");
                $self->{curInfo}->{publishdate} = $self->{curInfo}->{printdate};
            }
        }
    }

    # Constructor: builds the base plugin object, gives its user agent a
    # cookie jar and initialises the list-parsing state.
    sub new {
        my $proto = shift;
        my $class = ref($proto) || $proto;
        my $self  = $class->SUPER::new();

        $self->{ua}->cookie_jar(HTTP::Cookies->new);

        bless($self, $class);

        $self->{isResultsTable}     = 0;
        $self->{itemIdx}            = 0;
        $self->{resultsTableColumn} = 0;
        $self->{curName}            = undef;
        $self->{curUrl}             = undef;

        return $self;
    }

    # Declares which fields the result list of the current pass provides:
    # the series name in pass 1, title and volume afterwards.
    sub getReturnedFields {
        my $self = shift;

        if ($self->{pass} == 1) {
            $self->{hasField} = { series => 1, };
        }
        else {
            $self->{hasField} = {
                title  => 1,
                volume => 1,
            };
        }
    }

    # Hook run on a downloaded page before parsing; resets the
    # "parsingEnded" flag and returns the HTML unchanged.
    sub preProcess {
        my ($self, $html) = @_;

        $self->{parsingEnded} = 0;

        return $html;
    }

    # Returns the title-search URL for $word (expected to be URL-encoded
    # already, judging by the %28/%29 pattern below).
    sub getSearchUrl {
        my ($self, $word) = @_;

        $word =~ s/\+%28\d{4}%29$//;    # strip year from end of $word (title)

        # Grab the home page first, or the pages fetched are blank
        # (who knows why... must be something funky with the website)
        # NOTE(review): this request goes through LWP::Simple's shared $ua,
        # not through $self->{ua} which received the cookie jar in new() -
        # confirm that this is intended.
        $ua->get(BASE_URL);

        return BASE_URL . "search.php?form_search=$word&form_searchtype=Title";
    }

    # Returns an absolute URL for $url: absolute http/https URLs are passed
    # through, anything else is resolved against the site root.
    sub getItemUrl {
        my ($self, $url) = @_;

        return $url if $url =~ m{^https?:};

        # BASE_URL already ends with a slash; avoid doubling it
        $url =~ s{^/}{};
        return BASE_URL . $url;
    }

    # Number of search passes: series first, then issues of a series.
    sub getNumberPasses {
        return 2;
    }

    # Display name of the plugin.
    sub getName {
        return "Comic Book DB";
    }

    # Author of the plugin.
    sub getAuthor {
        return 'Zombiepig';
    }

    # Language of the data provided by the site.
    sub getLang {
        return 'EN';
    }

}

1;