#!/usr/bin/perl -l
#
# This script retrieves book information from Google Books.
#
# Usage: pass a single ISBN number as the first command-line argument.
#
# Output, one item per line (the -l switch appends the newline):
#   title
#   author
#   genre category (or "none" when the genre is unknown)
#   year of publishing (or "0" when it was not found)
#   "#" followed by the raw details string that was parsed
#
# Nothing is printed when no details chunk can be located in the page.
#
# (c) 2009 by Miek Gieben
# GPL licensed

use LWP::UserAgent;
use HTML::Scrubber;
use warnings;
use strict;

die "Need an ISBN number" if !$ARGV[0];

# Percent-encode everything outside the URL "unreserved" set so that a
# stray space or '&' in the argument cannot alter the query string.
# A plain ISBN (digits, 'X', hyphens) passes through unchanged.
my $isbn = $ARGV[0];
$isbn =~ s/([^A-Za-z0-9\-_.~])/sprintf('%%%02X', ord($1))/ge;

# Strip every tag except <div>, which is used below as chunk separator.
my $scrub = HTML::Scrubber->new( allow => [ qw/div/ ] );
my $ua    = LWP::UserAgent->new(
    agent => "Links (2.2; Linux 2.6.29-02062904-generic i686; 100x40" );

my $resp = $ua->get("http://books.google.com/books?q=$isbn");
if ( !$resp->is_success ) {
    # Do not try to parse an error page as if it were a result page.
    die "Request failed: " . $resp->status_line . "\n";
}

my @divs = split /<.?div>/, $scrub->scrub( $resp->content );

#print "@divs";

# Walk the chunks between the one matching "cover view" and the one
# matching "about this book" (flip-flop operator). Inside that window,
# a non-empty chunk preceded by a run of more than 5 empty chunks is
# taken as the details string; if several qualify, the last one wins.
my $details;
my $empty_run = 0;
foreach my $chunk (@divs) {
    if ( $chunk =~ /cover view/i .. $chunk =~ /about this book/i ) {
        if ( length($chunk) == 0 ) {
            $empty_run++;
        }
        else {
            # after some whitespace we get to the goodies
            # 5 seems to work ok
            $details = $chunk if $empty_run > 5;
            $empty_run = 0;
        }
    }
}

# No details found: produce no output at all.
exit if !$details;

# The details string is split as "<title>;<author> - <genre> - <year>".
# Any of the trailing parts may be absent, so give each one a default
# instead of operating on undef.
my ( $title, $rest ) = split /;/, $details;
$title = '' if !defined $title;
$rest  = '' if !defined $rest;

my ( $author, $genre, $year ) = split / - /, $rest;
$author = '' if !defined $author;

$title  =~ s/&#....$//;    # drop the cut-off HTML entity left by the split on ';'
$author =~ s/By //i;
$author =~ s/,.*$//;       # keep only the text before the first comma

# we might not have gotten a genre, then genre holds
# the year of publishing
if ( $genre && $genre =~ /\d+/ ) {
    $year  = $genre;
    $genre = "none";
}
$genre = defined($genre) ? lc($genre) : "none";

# Maps the (lower-cased) genre reported by Google to a local category.
my %category_for = (
    none                          => 'none',
    misc                          => 'misc',
    'comics & graphic novels'     => 'comics',
    games                         => 'comics',
    'language arts & disciplines' => 'art',
    art                           => 'art',
    music                         => 'art',
    'sports & recreation'         => 'recreation',
    cooking                       => 'cooking',
    literature                    => 'literature',
    mathematics                   => 'science',
    technology                    => 'science',
    science                       => 'science',
    'political science'           => 'science',
    'social science'              => 'science',
    'business & economics'        => 'science',
    psychology                    => 'science',
    'computer viruses'            => 'computer',
    computers                     => 'computer',
    'computer programming'        => 'computer',
    'juvenile fiction'            => 'fiction',
    fiction                       => 'fiction',
    humor                         => 'fiction',
    'science fiction'             => 'science fiction',
    fantasy                       => 'fantasy',
    dictionary                    => 'encyclopedia',
    travel                        => 'travel'
);

# Unknown genres fall back to "none", a missing year to "0".
my $category  = $category_for{$genre} || "none";
my $published = $year || "0";

print $title;
print $author;
print $category;
print $published;
print "#", $details;