From 126bb8cb6b93240bb4d3a2b816b74c286c3d422b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Sun, 6 Jul 2014 15:20:38 +0200 Subject: Imported Upstream version 1.7.0 --- lib/gcstar/GCPlugins/GCfilms/GCCsfd.pm | 699 +++++++++++++++++++++++++++++++++ 1 file changed, 699 insertions(+) create mode 100644 lib/gcstar/GCPlugins/GCfilms/GCCsfd.pm (limited to 'lib/gcstar/GCPlugins/GCfilms/GCCsfd.pm') diff --git a/lib/gcstar/GCPlugins/GCfilms/GCCsfd.pm b/lib/gcstar/GCPlugins/GCfilms/GCCsfd.pm new file mode 100644 index 0000000..ea84b45 --- /dev/null +++ b/lib/gcstar/GCPlugins/GCfilms/GCCsfd.pm @@ -0,0 +1,699 @@ +# Replace SiteTemplate with your plugin name. +# The package name must exactly match the file name (.pm) +package GCPlugins::GCfilms::GCCsfd; + +################################################### +# +# Copyright 2005-2009 Tian +# Copyright 2007,2011 Petr Gajdůšek +# +# This file is part of GCstar. +# +# GCstar is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# GCstar is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
#
# You should have received a copy of the GNU General Public License
# along with GCstar; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
#
###################################################

use strict;
#use warnings;
use utf8;

use GCPlugins::GCfilms::GCfilmsCommon;

{

    # Plugin class for the CSFD.cz movie database.
    # Its name must be the same as the one used for the file and the main
    # package name.
    package GCPlugins::GCfilms::GCPluginCsfd;

    use base qw(GCPlugins::GCfilms::GCfilmsPluginsBase);

    # getSearchCharset
    # Charset of search term
    sub getSearchCharset
    {
        return 'UTF-8';
    }

    # getSearchUrl
    # Used to get the URL to be used to perform searches.
    # $word is the query
    # Returns the full URL.
    # NOTE(review): $word is inserted into the query string as-is; presumably
    # the caller has already encoded it (see getSearchCharset) - confirm.
    sub getSearchUrl
    {
        my ($self, $word) = @_;
        return "http://www.csfd.cz/hledat/?q=$word";
    }

    # getItemUrl
    # Used to get the full URL of a movie page.
    # Useful when url on results pages are relative.
    # $url is the URL as found with a search.
    # Returns the absolute URL.
    sub getItemUrl
    {
        my ($self, $url) = @_;

        # An undefined URL resolves to the bare site address.
        $url = '' if (!defined $url);

        # Already absolute, over plain HTTP or TLS: nothing to do.
        return $url if ($url =~ m{^https?:}i);

        # Protocol-relative link ("//host/path"): only the scheme is missing.
        return "http:" . $url if ($url =~ m{^//});

        # Site-relative link: make sure exactly one slash separates host and
        # path so that "film/1-x" does not become "www.csfd.czfilm/1-x".
        $url = "/" . $url if ($url ne '' and $url !~ m{^/});
        return "http://www.csfd.cz" . $url;
    }

    # getCharset
    # Used to convert charset in web pages.
    # Returns the charset as specified in pages.
    #sub getCharset {
    #    my $self = shift;
    #
    #    return "UTF-8";
    #}

    # getName
    # Used to display plugin name in GUI.
    # Returns the plugin name.
    sub getName
    {
        return "CSFD.cz";
    }

    # getAuthor
    # Used to display the plugin author in GUI.
    # Returns the plugin author name.
    sub getAuthor
    {
        return 'Petr Gajdůšek';
    }

    # getLang
    # Used to fill in plugin list with user language plugins
    # Return the language used for this site (2 letters code).
    sub getLang
    {
        return 'CS';
    }

    # hasSearchYear
    # Used to hide year column in search results
    # Return 0 to hide column, 1 to show it.
+ sub hasSearchYear + { + return 1; + } + + # hasSearchDirector + # Used to hide director column in search results + # Return 0 to hide column, 1 to show it. + sub hasSearchDirector + { + return 1; + } + + # hasSearchActors + # Used to hide actors column in search results + # Return 0 to hide column, 1 to show it. + sub hasSearchActors + { + return 1; + } + + # getExtra + # Used if the plugin wants an extra column to be displayed in search results + # Return the column title or empty string to hide the column. + sub getExtra + { + + return 'Žánr'; + } + + # changeUrl + # Can be used to change URL if movie URL and the one used to + # extract information are different. + # Return the modified URL. + sub changeUrl + { + my ($self, $url) = @_; + + return $url; + } + + # preProcess + # Called before each page is processed. You can use it to do some substitutions. + # $html is the page content. + # Returns modified version of page content. + sub preProcess + { + my ($self, $html) = @_; + $self->{parsingEnded} = 0; + if ($self->{parsingList}) + { + # Search results + + # Initial values for search results parsing + # There are two movies list: + # First with detailed info (title, genre, origin country, year, directors, actors) + # Second with brief list of other movies (title, year) + + # We are in brief list containing other movies without details + $self->{insideOtherMovies} = 0; + # Movie link; movie's details follow if not in brief list + $self->{isMovie} = 0; + + ## Details: + + # Movie's details will follow: Genre, origin, actors, directors, year + $self->{insideDetails} = 0; + # In movie's details after paragraph with Genre, origin and date + $self->{wasDetailsInfo} = 0; + # In movie's details: directors and actors + $self->{directors} = (); + $self->{directorsCounter} = 0; + $self->{actors} = (); + $self->{actorsCounter} = 0; + $self->{insideDirectors} = 0; + $self->{insideActors} = 0; + + # Movie year + $self->{isYear} = 0; + + ## Preprocess + + # directors and 
actors + $html =~ s/\n\s*Režie:\s([^\n]*)/
$1<\/div>/g; + $html =~ s/\n\s*Hrají:\s([^\n].*)/
$1<\/div>/g; + # year + $html =~ s/\(([0-9]+)\)<\/span>/$1<\/span>/g; + } + else + { + # Movie page + + # Initial values for search results parsing + + # array containg other movie titles (not exported to GCStar) + $self->{titles} = (); + # in list containing other movie titles + $self->{isTitles} = 0; + # in the original title (title for same country as movie's origin) + $self->{isOrigTitle} = 0; + # original title (if not set during parsing it will be set to main title at the end) + $self->{origTitle} = undef; + $self->{titlesCounter} = 0; + + $self->{insideGenre} = 0; + + $self->{awaitingSynopsis} = 0; + $self->{insideSynopsis} = 0; + + # inside details with country, date (year) and time (length) + $self->{insideInfo} = 0; + + $self->{insideRating} = 0; + + # User comments + # Each comment consists of commenter (user) and his comment + + $self->{insideCommentAuthor} = 0; + $self->{awaitingComment} = 0; + $self->{insideComment} = 0; + + # In directors and actors + $self->{insideDirectors} = 0; + $self->{insideActors} = 0; + $self->{directors} = (); + $self->{directorsCounter} = 0; + $self->{actors} = (); + $self->{actorsCounter} = 0; + + ## Preprocess + + # removee
and
+ $html =~ s//\n/g; + ## Synopsis + # remove list bullet + $html =~ s/]*>//g; + # remove hyperlink to user profile + $html =~ s/( /\n-- $1/g; + $html =~ s/
([^<]*)<\/div>/$1/g; + } + return $html; + } + + # In processing functions below, self->{parsingList} can be used. + # If true, we are processing a search results page + # If false, we are processing a movie information page. + + # $self->{inside}->{tagname} (with correct value for tagname) can be used to test + # if we are in the corresponding tag. + + # You have a counter $self->{movieIdx} that have to be used when processing search results. + # It is your responsability to increment it! + + # When processing search results, you have to fill (if available) following fields: + # + # $self->{movieList}[$self->{movieIdx}]->{title} + # $self->{movieList}[$self->{movieIdx}]->{url} + # $self->{movieList}[$self->{movieIdx}]->{actors} + # $self->{movieList}[$self->{movieIdx}]->{director} + # $self->{movieList}[$self->{movieIdx}]->{date} + # $self->{movieList}[$self->{movieIdx}]->{extra} + + # When processing a movie page, you need to fill the fields (if available) in $self->{curInfo}. They are: + # + # $self->{curInfo}->{title} + # $self->{curInfo}->{director} + # $self->{curInfo}->{original} (Original title) + # $self->{curInfo}->{actors} + # $self->{curInfo}->{genre} (Comma separated list of movie type) + # $self->{curInfo}->{country} (Movie Nationality or country) + # $self->{curInfo}->{date} + # $self->{curInfo}->{time} + # $self->{curInfo}->{synopsis} + # $self->{curInfo}->{image} + # $self->{curInfo}->{audio} + # $self->{curInfo}->{subt} + # $self->{curInfo}->{age} 0 : No information + # 1 : Unrated + # 2 : All audience + # 5 : Parental Guidance + # >= 10 : Minimum age value + + # start + # Called each time a new HTML tag begins. + # $tagname is the tag name. + # $attr is reference to an associative array of tag attributes. + # $attrseq is an array reference containing all the attributes name. 
+ # $origtext is the tag text as found in source file + # Returns nothing + sub start + { + my ($self, $tagname, $attr, $attrseq, $origtext) = @_; + $self->{inside}->{$tagname}++; + + + if ($self->{parsingEnded}) + { + return; + } + + if ($self->{parsingList}) + { + + # in brief list of other movies (without details) + if ($tagname eq "ul" and $attr->{class} eq "films others") + { + $self->{insideOtherMovies} = 1; + } + + # in link to movie page + if ($tagname eq "a" and $attr->{href} =~ m/\/film\/[0-9]+-.*/) + { + $self->{isMovie} = 1; + $self->{itemIdx}++; + $self->{itemsList}[ $self->{itemIdx} ]->{url} = $attr->{href}; + $self->{insideDetails} = 1 if ($self->{insideOtherMovies} != 1); + $self->{wasDetailsInfo} = 0; + } + + # directors and actors + if ($tagname eq "div") + { + $self->{insideDirectors} = 1 if ($attr->{class} eq "directors"); + $self->{insideActors} = 1 if ($attr->{class} eq "actors"); + } + + # year + if ($tagname eq "span") + { + $self->{isYear} = 1 if ($attr->{class} eq "film-year"); + } + } + else + { + + # Synopsis + if ( $tagname eq "div" + and $attr->{class} eq "content" + and $self->{awaitingSynopsis}) + { + $self->{insideSynopsis} = 1; + $self->{awaitingSynopsis} = 0; + } + + # Poster + if ( $tagname eq "img" + and $attr->{src} =~ /^http:\/\/img\.csfd\.cz\/posters\//) + { + $self->{curInfo}->{image} = $attr->{src}; + } + + # Original name and other names + if ($tagname eq "ul" and $attr->{class} eq "names") + { + $self->{isTitles} = 1; + } + + if ($tagname eq "img" and $self->{isTitles}) + { + $self->{isOrigTitle} = 1 if ($attr->{alt} !~ /název$/); + $self->{isSKTitle} = 1 if ($attr->{alt} =~ /SK název$/); + } + + # Genre + if ($tagname eq "p" and $attr->{class} eq "genre") + { + $self->{insideGenre} = 1; + } + + # Info (country ,date, time = duration) + if ($tagname eq "p" and $attr->{class} eq "origin") + { + $self->{insideInfo} = 1; + } + + # Rating + if ($tagname eq "h2" and $attr->{class} eq "average") + { + $self->{insideRating} = 1; 
+ } + + # Comments + if ($tagname eq "h5" and $attr->{class} eq "author") + { + $self->{insideCommentAuthor} = 1; + } + if ($self->{awaitingComment} and $tagname eq "p" and $attr->{class} eq "post") + { + $self->{awaitingComment} = 0; + $self->{insideComment} = 1; + } + + } + } + + # end + # Called each time a HTML tag ends. + # $tagname is the tag name. + sub end + { + my ($self, $tagname) = @_; + $self->{inside}->{$tagname}--; + + if ($self->{parsingList}) + { + + # movie details + $self->{insideDetails} = 0 + if ($tagname eq "div") + and $self->{insideDetails}; + + # directors and actors + if ($tagname eq "div") + { + if ($self->{insideDirectors}) + { + $self->{insideDirectors} = 0; + $self->{itemsList}[ $self->{itemIdx} ]->{director} = + join(', ', @{$self->{directors}}); + $self->{directors} = (); + $self->{directorsCounter} = 0; + } + if ($self->{insideActors}) + { + $self->{insideActors} = 0; + $self->{itemsList}[ $self->{itemIdx} ]->{actors} = + join(', ', @{$self->{actors}}); + $self->{actors} = (); + $self->{actorsCounter} = 0; + } + } + } + else + { + + # Synopsis + $self->{insideSynopsis} = 0 if ($tagname eq "div"); + + # Titles + if ($tagname eq "ul" and $self->{isTitles}) + { + $self->{isTitles} = 0; + } + + if ( $tagname eq "body" ) + { + $self->{curInfo}->{original} ||= $self->{curInfo}->{title}; + } + + # Actors + if ($tagname eq "div" and $self->{insideActors}) + { + $self->{curInfo}->{actors} = join(', ', @{$self->{actors}}); + $self->{insideActors} = 0; + } + + # Directors + if ($tagname eq "div" and $self->{insideDirectors}) + { + $self->{curInfo}->{director} = join(', ', @{$self->{directors}}); + $self->{insideDirectors} = 0; + } + + # Comment + + $self->{insideCommentAuthor} = 0 + if ($tagname eq "h5" and $self->{insideCommentAuthor}); + + if ($tagname eq "li" and $self->{isComment}) + { + $self->{curInfo}->{comment} .= "\n"; + $self->{isComment} = 0; + } + + # Debug + if ($tagname eq "body" and $self->{debug}) + { + use Data::Dumper; + print 
Dumper $self->{curInfo}; + } + } + } + + # text + # Called each time some plain text (between tags) is processed. + # $origtext is the read text. + sub text + { + my ($self, $origtext) = @_; + + return if length($origtext) < 2; + $origtext =~ s/^\s+|\s+$//g; + + return if ($self->{parsingEnded}); + + if ($self->{parsingList}) + { + if ($self->{inside}->{h1} && $origtext !~ m/Vyhledávání/i) + { + $self->{parsingEnded} = 1; + $self->{itemIdx} = 0; + $self->{itemsList}[0]->{url} = $self->{loadedUrl}; + } + + # Movie title + if ($self->{isMovie}) + { + $self->{itemsList}[ $self->{itemIdx} ]->{"title"} = $origtext; + $self->{isMovie} = 0; + return; + } + + # Date (year) + elsif ($self->{isYear}) + { + $self->{itemsList}[ $self->{itemIdx} ]->{"date"} = $origtext; + $self->{isYear} = 0; + } + + # Extra movie info: genre, origin, date + elsif ( $self->{inside}->{p} + and $self->{insideDetails} + and $self->{wasDetailsInfo} == 0) + { + my @tmp = split(', ', $origtext); + my $pos = $#tmp; + my ($year, $country, $genre) = (undef, undef, undef); + $year = $tmp[$pos] if ($tmp[$pos] =~ /^\d+$/); + $pos--; + $country = $tmp[$pos] if ($pos >= 0); + $pos--; + $genre = $tmp[$pos] if ($pos >= 0); + + $self->{itemsList}[ $self->{itemIdx} ]->{date} = $year if (defined $year); + $self->{itemsList}[ $self->{itemIdx} ]->{country} = $country + if (defined $country); + $self->{itemsList}[ $self->{itemIdx} ]->{extra} = $genre + if (defined $genre); + $self->{wasDetailsInfo} = 1; + } + + # Directors + elsif ($self->{inside}->{a} and $self->{insideDirectors}) + { + push @{$self->{directors}}, $origtext; + $self->{directorsCounter}++; + } + + # Actors + elsif ($self->{inside}->{a} and $self->{insideActors}) + { + push @{$self->{actors}}, $origtext; + $self->{actorsCounter}++; + } + } + else + { + + # Movie titles + if ($self->{inside}->{h1}) + { + $self->{curInfo}->{title} = $origtext + if !$self->{curInfo}->{title}; + } + if ($self->{inside}->{h3} and $self->{isTitles}) + { + 
$self->{titlesCounter}++; + $self->{titles}[ $self->{titlesCounter} ] = $origtext; + if ($self->{isOrigTitle}) + { + $self->{curInfo}->{original} ||= $origtext; + $self->{isOrigTitle} = 0; + } + if ($self->{isSKTitle} and $self->{lang} eq "SK") + { + $self->{curInfo}->{title} = $origtext; + $self->{isSKTitle} = 0; + } + } + + # Genre + if ($self->{insideGenre}) + { + $origtext =~ s/ \/ /,/g; + $self->{curInfo}->{genre} = $origtext; + $self->{insideGenre} = 0; + } + + # Extra movie info: country, date (year), time + if ($self->{insideInfo}) + { + my ($country, $year, $time) = split(', ', $origtext); + $country =~ s/ \/ /,/g; + + $self->{curInfo}->{country} = $country; + $self->{curInfo}->{date} = $year; + $self->{curInfo}->{time} = $time; + + $self->{insideInfo} = 0; + } + + # Directors and Actors + if ($self->{inside}->{h4}) + { + $self->{insideDirectors} = 1 if ($origtext =~ /^Režie:/); + $self->{insideActors} = 1 if ($origtext =~ /^Hrají:/); + } + + if ($self->{inside}->{a} and $self->{insideDirectors}) + { + push @{$self->{directors}}, $origtext; + $self->{directorsCounter}++; + } + if ($self->{inside}->{a} and $self->{insideActors}) + { + #push @{$self->{curInfo}->{actors}}, [$origtext] + # if ($self->{actorsCounter} < + # $GCPlugins::GCfilms::GCfilmsCommon::MAX_ACTORS); + #$self->{actorsCounter}++; + push @{$self->{actors}}, $origtext; + $self->{actorsCounter}++; + } + + # Synopsis + if ($self->{inside}->{h3}) + { + $self->{awaitingSynopsis} = 1 if ($origtext eq "Obsah"); + } + if ($self->{inside}->{li} and $self->{insideSynopsis}) + { + $self->{curInfo}->{synopsis} .= $origtext . "\n\n\n"; + } + + # Rating + if ($self->{insideRating}) + { + $origtext =~ s/([0-9]+)%/$1/; + $self->{curInfo}->{ratingpress} = int($origtext / 10 + .5) + if ($origtext ne ""); + $self->{insideRating} = 0; + } + + # Comments + if ($self->{inside}->{a} and $self->{insideCommentAuthor}) + { + $self->{curInfo}->{comment} .= $origtext . 
" napsal(a):\n"; + $self->{awaitingComment} = 1; + } + if ($self->{insideComment}) + { + $self->{curInfo}->{comment} .= $origtext . "\n\n"; + $self->{insideComment} = 0; + } + } + } + + # new + # Constructor. + # Returns object reference. + sub new + { + my $proto = shift; + my $class = ref($proto) || $proto; + my $self = $class->SUPER::new(); + + # Do your init stuff here + + bless($self, $class); + + $self->{hasField} = { + title => 1, + date => 1, + director => 1, + actors => 1, + country => 1 + }; + + $self->{lang} = "CS"; + + $self->{curName} = undef; + $self->{curUrl} = undef; + + $self->{debug} = ($ENV{GCS_DEBUG_PLUGIN_PHASE} > 0); + + return $self; + } + +} + +1; -- cgit v1.2.3