#!/usr/bin/perl
use utf8;

use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Request::Common qw(GET);
use HTTP::Date qw(parse_date time2str);
use CGI;
use CGI qw(:standard);

use Encode;
use Encode qw(from_to);
use Encode::Guess;

use DBI;

# Database connection settings: DBI data source name and credentials.
my $dsn    = 'DBI:mysql:news:localhost';
my $dbUser = 'news';
my $dbPass = 'copernicus';

# Shared database handle; assigned in main() and closed in halt().
my $dbh;

# When set to 1 the duplicate checks always answer "not seen yet" and the
# INSERT statements are printed or skipped instead of being executed.
my $debug = 0;

# Derived from $debug; not referenced by any other code in this file.
my $force_to_load = $debug * 1;

use HTML::TreeBuilder;

#print &bodyparse_asahi("http://www.asahi.com/international/update/0918/TKY201009180359.html");


main();

# Entry point: download every configured RSS feed and store each article
# that is not in the database yet.
#
# For each feed the raw RSS text and its timestamp are fetched, then
# checkRSS_log() decides whether this feed snapshot was already processed.
# New snapshots are split into articles; an article is registered only if
# its URL and title are unknown, its URL contains the feed's source name,
# and a non-empty body could be extracted.
#
# Dies when the database connection cannot be established.  Always ends
# the program through halt().
sub main
{
	# Fail right away with a readable message instead of crashing later on
	# the first method call against an undefined handle.
	$dbh = DBI->connect($dsn, $dbUser, $dbPass)
		or die "Cannot connect to $dsn: $DBI::errstr\n";

	my @RSSs = (
		{
			source	=> 'asahi',
			url 	=> 'http://feeds.asahi.com/asahi/TopHeadlines',
		},
		# Disabled feed; uncomment to enable it again.
		# {
		# 	source	=> 'mainichi',
		# 	url 	=> 'http://mainichi.jp/rss/etc/flash.rss',
		# },
	);

	foreach my $rss (@RSSs) {
		$rss->{rss}  = getPage($rss->{url});
		$rss->{date} = getDate($rss->{rss});

		if (checkRSS_log($dbh, $rss) != 0) {
			print "Already Checked\n";
			next;
		}

		print "Start registering RSS\n";

		foreach my $article (readRSS($rss->{rss})) {
			next unless checkURL($dbh, $article->{url}) == 0;
			next unless checkTITLE($dbh, $article->{title}) == 0;

			# Only follow links that point back to the feed's own site.
			next unless $article->{url} =~ /$rss->{source}/;

			# checkRSS_log() has already recorded this feed snapshot as
			# processed, so a download error on one article must not abort
			# the run: the remaining articles would never be picked up.
			my $body;
			my $fetched = eval {
				$body = getBody($rss->{source}, $article->{url});
				1;
			};
			unless ($fetched) {
				warn "Skipping $article->{url}: $@";
				next;
			}
			next if !defined($body) || $body eq '';

			$article->{body} = $body;
			registArticle($dbh, $article);
		}
	}

	halt();
}

# Decide whether a feed snapshot still has to be processed.
#
# Arguments:
#   $_[0]  database handle
#   $_[1]  hash reference with the keys "date" (feed timestamp string) and
#          "source" (feed name)
#
# Returns 0 when the (date, offset, source) combination is not in RSS_log
# yet, or when $debug is 1; in the first case the combination is inserted,
# in debug mode the INSERT statement is printed instead.  Returns 1 when
# the combination is already logged or when the timestamp cannot be parsed.
sub checkRSS_log{
	my ($handle, $rss) = @_;
	my $source = $rss->{source};

	# parse_date() yields (year, month, day, hour, min, sec, tz) or nothing.
	my @stamp = parse_date($rss->{date});
	return 1 unless defined $stamp[0];

	my $datestr = sprintf("%04d-%02d-%02dT%02d:%02d:%02d", @stamp[0 .. 5]);

	# Store the offset without its leading plus sign.
	my $tz = $stamp[6];
	$tz =~ s/^\+//;

	my $select = qq{SELECT COUNT(*) FROM RSS_log WHERE date="$datestr" AND offset="$tz" AND source="$source"};
	my $insert = qq{INSERT INTO RSS_log SET date="$datestr", offset="$tz", source="$source"};

	my ($rows) = $handle->selectrow_array($select);
	return 1 if $rows != 0 && $debug != 1;

	if ($debug == 1) {
		print $insert;
	}
	else {
		$handle->do($insert);
	}
	return 0;
}

# Report whether an article URL is already stored.
#
# Arguments:
#   $_[0]  database handle
#   $_[1]  article URL taken from the feed
#
# Returns 0 when no row of "articles" has this URL (or when $debug is 1),
# 1 otherwise.
sub checkURL{
	my ($handle, $url) = @_;

	# The URL comes from an external feed, so it is passed as a bound
	# parameter rather than pasted into the statement.  The feed text has
	# had its double quotes backslash-escaped by getPage(); that escaping
	# was only meant for the SQL literal and is undone before binding so
	# the comparison sees the value the literal used to denote.
	my $value = defined $url ? $url : '';
	$value =~ s/\\+"/"/g;

	my ($rows) = $handle->selectrow_array(
		'SELECT COUNT(*) FROM articles WHERE url=?', undef, $value);

	return ($rows == 0 or $debug == 1) ? 0 : 1;
}

# Report whether an article with this title is already stored.
#
# Arguments:
#   $_[0]  database handle
#   $_[1]  article title taken from the feed
#
# Returns 0 when no row of "articles" has this title (or when $debug is
# 1), 1 otherwise.
sub checkTITLE{
	my ($handle, $title) = @_;

	# The title comes from an external feed, so it is passed as a bound
	# parameter.  Upstream code backslash-escapes double quotes (getPage()
	# and readRSS() each do it once); strip those backslashes so the
	# lookup uses the plain title text.
	my $value = defined $title ? $title : '';
	$value =~ s/\\+"/"/g;

	# Compare against the title column.  The previous query compared the
	# title with the url column and therefore never found a duplicate.
	my ($rows) = $handle->selectrow_array(
		'SELECT COUNT(*) FROM articles WHERE title=?', undef, $value);

	return ($rows == 0 or $debug == 1) ? 0 : 1;
}

# Store one article and record its database id in the article hash.
#
# Arguments:
#   $_[0]  database handle
#   $_[1]  hash reference with the keys url, title, subject, date and body
#
# The subject is resolved to an id through registSubject().  Unless
# $debug is 1 the article is inserted into "articles" and the resulting
# a_id is stored under the key "article_id".
#
# Returns the same hash reference.
sub registArticle{
	my ($handle, $hash) = @_;

	my ($year, $month, $day, $hour, $min, $sec, $tz) = parse_date($hash->{date});
	my $datestr = sprintf("%04d-%02d-%02dT%02d:%02d:%02d", $year, $month, $day, $hour, $min, $sec);

	# Store the offset without its leading plus sign; a missing offset is
	# stored as an empty string, as it was when the value was interpolated.
	$tz = '' unless defined $tz;
	$tz =~ s/^\+//;

	my ($s_id) = registSubject($handle, $hash->{subject});
	$s_id = '' unless defined $s_id;

	# Debug mode stops here, before anything is written.
	return $hash if $debug == 1;

	# URL, title and body are scraped from external pages, so they are
	# bound as parameters instead of being pasted into the statement.
	# Upstream code backslash-escapes double quotes for use inside an SQL
	# literal; undo that so the stored text contains plain quotes.
	my %text;
	foreach my $field (qw(url title body)) {
		my $value = defined $hash->{$field} ? $hash->{$field} : '';
		$value =~ s/\\+"/"/g;
		$text{$field} = $value;
	}

	$handle->do(
		'INSERT INTO articles SET url=?, date=?, offset=?, s_id=?, title=?, body=?',
		undef,
		$text{url}, $datestr, $tz, $s_id, $text{title}, $text{body},
	);

	my ($article_id) = $handle->selectrow_array(
		'SELECT a_id FROM articles WHERE url=?', undef, $text{url});
	$hash->{'article_id'} = $article_id;

	return $hash;
}

# Look up the id of a subject, creating the subject row when it is missing.
#
# Arguments:
#   $_[0]  database handle
#   $_[1]  subject text taken from the feed
#
# Returns the s_id of the matching row in "subjects" (undef when the
# lookup yields no row).
sub registSubject
{
	my ($handle, $subject) = @_;

	# The subject comes from an external feed, so it is passed as a bound
	# parameter.  Upstream code backslash-escapes double quotes for use
	# inside an SQL literal; undo that so plain text is stored and compared.
	my $name = defined $subject ? $subject : '';
	$name =~ s/\\+"/"/g;

	my ($row) = $handle->selectrow_array(
		'SELECT COUNT(*) FROM subjects WHERE subject=?', undef, $name);
	if ($row == 0)
	{
		$handle->do('INSERT INTO subjects SET subject=?', undef, $name);
	}

	my ($s_id) = $handle->selectrow_array(
		'SELECT s_id FROM subjects WHERE subject=?', undef, $name);

	return $s_id;
}

# Download a URL and return its content as a decoded character string.
#
# Arguments:
#   $_[0]  URL to fetch
#
# The encoding is taken from the first "charset=" declaration in the
# content; EUC-JP, Shift_JIS and UTF-8 are recognised and anything else is
# treated as UTF-8.  The content is converted to UTF-8, the declared
# charset name inside the text is rewritten to "utf-8", and every double
# quote is backslash-escaped before the text is returned.
#
# Dies with the HTTP status line when the request does not succeed.
sub getPage{
	my($url) = $_[0];

	# Create a user agent and issue the request.
	my $ua = LWP::UserAgent->new;
	$ua->agent('Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; ja-jp) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1');

	my $res = $ua->request(GET($url));

	unless ($res->is_success) {
		# A method call is not interpolated inside a string, so the
		# status line has to be concatenated explicitly.
		die $res->status_line . "\n";
	}

	my $body = $res->content;

	my $decoder = 'utf-8';	# encoding used to read the bytes
	my $label   = 'utf-8';	# charset name as written in the document

	# Accept both charset=NAME and charset="NAME".
	if ($body =~ m/charset=["']?([-\w]+)/) {
		my $declared = lc($1);
		if ($declared eq 'euc-jp') {
			($decoder, $label) = ('euc-jp', $declared);
		}
		elsif ($declared =~ /\A(?:shift[-_]jis|x-sjis|sjis)\z/) {
			# Pages commonly spell this "Shift_JIS"; all spellings
			# map to the same Encode encoding.
			($decoder, $label) = ('shiftjis', $declared);
		}
	}

	from_to($body, $decoder, 'utf-8');

	# Keep the document's own charset declaration in line with the
	# converted content.
	$body =~ s/\Q$label\E/utf-8/gi;

	utf8::decode($body);

	# Callers embed this text in double-quoted contexts and match
	# attribute values in this escaped form.
	$body =~ s/"/\\"/g;

	return $body;
}

# Split RSS text into its <item> elements and extract the article fields.
#
# Arguments:
#   $_[0]  RSS document as a string
#
# Returns a list of hash references with the keys title, url, subject and
# date.  A field that is missing from an item is an empty string.  Items
# whose title is a CDATA section starting with "AD:" are skipped.  Double
# quotes in title and subject are backslash-escaped.
sub readRSS{
	my($RSS) = $_[0];
	my(@ARRAY);

	ITEM:
	while ($RSS =~ m|(<item[>\s].*?</item>)|gs) {
		my $item = $1;

		# Fresh values for every item, so nothing from a skipped
		# advertisement can leak into the next entry.
		my %entry = (title => '', url => '', subject => '', date => '');

		if ($item =~ m|<title>(.*?)</title>|) {
			$entry{title} = $1;
			next ITEM if $entry{title} =~ /<!\[CDATA\[AD:.+\]\]>/;
		}
		if ($item =~ m|<link>(.*?)</link>|) {
			$entry{url} = $1;
		}
		if ($item =~ m|<dc:subject>(.*?)</dc:subject>|) {
			$entry{subject} = $1;
		}
		if ($item =~ m|<dc:date>([-+T:\d]*?)</dc:date>|) {
			$entry{date} = $1;
		}

		# Escape only quotes that are not escaped already: the feed text
		# may arrive with its quotes escaped, and a second backslash
		# would turn \" into \\" and un-escape the quote again.
		foreach my $field (qw(title subject)) {
			$entry{$field} =~ s/(?<!\\)"/\\"/g;
		}

		push(@ARRAY, \%entry);
	}

	return (@ARRAY);
}

# Extract the article body for a URL with the parser of the given source.
#
# Arguments:
#   $_[0]  source name ('asahi' or 'mainichi')
#   $_[1]  article URL
#
# Returns whatever the source-specific parser returns, or an empty string
# for a source that has no parser.
sub getBody
{
	my ($source, $url) = @_;

	# One parser per supported news site.
	my %parser_for = (
		asahi    => \&bodyparse_asahi,
		mainichi => \&bodyparse_mainichi,
	);

	my $body = '';
	if (defined $source && exists $parser_for{$source}) {
		$body = $parser_for{$source}->($url);
	}

	return $body;
}

# Fetch an Asahi article page and build an HTML fragment from it.
#
# Arguments:
#   $_[0]  article URL
#
# The fragment consists of an optional <table> with the article images
# followed by one <p> element per non-empty paragraph of the body.  When
# the URL is a first page, links to its continuation pages ("<file>_NN")
# are followed and their fragments appended.
#
# Returns the fragment, or an empty string when the page is empty or has
# no body element.
sub bodyparse_asahi{
	my($url) = $_[0];
	my($file) = '';
	my($filepath) = '';
	my($rootfile) = 0;

	# Split the URL into directory and file name.  File names containing
	# "_" are excluded by the pattern, so continuation pages normally do
	# not count as a first page and are not followed any further.
	if ($url =~ m|(.+/)([^/_]+)(?=\.html)|){
		$filepath = $1;
		$file = $2;
		$rootfile = 1;
	}

	my($body) = getPage($url);
	return '' if $body eq '';

	# Parser for Asahi pages.
	my $tree = HTML::TreeBuilder->new;
	$tree->parse($body);
	$tree->eof;

	# getPage() backslash-escapes every double quote, so the class value
	# is looked up including those escaped quotes.
	my($bodytext) = $tree->look_down('class','\"BodyTxt\"');

	# Test the element itself; defined(%hash) is rejected by current perls.
	unless (defined $bodytext){
		$tree->delete;	# release the tree on this exit path as well
		return "";
	}

	my($entrypart) = '';
	foreach my $p ($bodytext->find('p')){
		my($ptext) = $p->as_trimmed_text;
		$ptext =~ s|\s||ig;
		if ($ptext ne ''){
			$entrypart .= "<p>$ptext</p>\n";
		}
	}

	# Collect the photos: images without a class inside "Phot" cells.
	my($imagetable) = '';
	foreach my $cell ($tree->look_down('_tag' => 'td', 'class' =>'\"Phot\"')){
		foreach my $img ($cell->look_down('_tag' => 'img')){
			next if defined($img->attr('class'));

			my($alt, $src) = ($img->attr('alt'), $img->attr('src'));
			$alt =~ s|\s||ig;
			$alt =~ s|：(?=\\")||;	# drop a trailing full-width colon
			$imagetable .= "<tr><td><img src=$src alt=$alt /></td></tr>";
		}
	}

	$imagetable = ($imagetable ne '')? "<table>$imagetable</table>\n": '';
	$entrypart = $imagetable.$entrypart;

	# Links to continuation pages: href contains "<file>_NN" and the link
	# text contains a number.
	my(@nexts);
	if ($rootfile == 1){
		# The file name comes from the URL; quote it so that it is
		# matched literally.
		my $continuation = qr/\Q$file\E_\d{2}/;
		foreach my $link ($tree->look_down('_tag' => 'a', 'href' => $continuation)){
			next unless $link->as_text =~ m|\d+|;
			if ($link->attr('href') =~ m|\\"(.+)\\"|){
				push(@nexts, "$filepath$1");
			}
		}
	}

	$tree->delete;

	foreach my $next_url (@nexts){
		$entrypart .= bodyparse_asahi($next_url);
	}

	return $entrypart;
}

# Fetch a Mainichi article page and build an HTML fragment from it.
#
# Arguments:
#   $_[0]  article URL
#
# The fragment consists of an optional <table> with the images found in
# the article body followed by one <p> element per non-empty paragraph.
#
# Returns the fragment, or an empty string when the page is empty or has
# no body element.
sub bodyparse_mainichi{
	my($url) = $_[0];

	my($body) = getPage($url);
	return '' if $body eq '';

	# Parser for Mainichi pages.
	my $tree = HTML::TreeBuilder->new;
	$tree->parse($body);
	$tree->eof;

	# getPage() backslash-escapes every double quote, so the class value
	# is looked up including those escaped quotes.
	my($bodytext) = $tree->look_down('class','\"NewsBody\"');

	# Test the element itself; defined(%hash) is rejected by current perls.
	unless (defined $bodytext){
		$tree->delete;
		return "";
	}

	my($entrypart) = '';
	foreach my $p ($bodytext->find('p')){
		my($ptext) = $p->as_trimmed_text;
		$ptext =~ s|\s||ig;
		if ($ptext ne ''){
			$entrypart .= "<p>$ptext</p>\n";
		}
	}

	# Collect the images without a class inside the article body.
	my($imagetable) = '';
	foreach my $img ($bodytext->look_down('_tag' => 'img')){
		next if defined($img->attr('class'));

		my($alt, $src) = ($img->attr('alt'), $img->attr('src'));
		$alt =~ s|\s||ig;
		$alt =~ s|：(?=\\")||;	# drop a trailing full-width colon
		# Prefix the image path with the site address.
		$src =~ s|\\"|\\"http://mainichi.jp|;
		$imagetable .= "<tr><td><img src=$src alt=$alt /></td></tr>";
	}

	# Everything needed has been copied into strings; release the tree.
	$tree->delete;

	$imagetable = ($imagetable ne '')? "<table>$imagetable</table>\n": '';
	$entrypart = $imagetable.$entrypart;

	return $entrypart;
}

# Extract the timestamp of an RSS document.
#
# Arguments:
#   $_[0]  RSS document as a string
#
# Returns the content of the first <syn:updateBase> element, otherwise
# the content of the first <dc:date> element, otherwise an empty string.
# Only values made of digits and the characters - + T : are accepted.
sub getDate{
	my ($RSS) = @_;

	# Candidate elements, most preferred first.
	my @candidates = (
		qr|(?<=<syn:updateBase>)[-+T:\d]*?(?=</syn:updateBase>)|i,
		qr|(?<=<dc:date>)[-+T:\d]*?(?=</dc:date>)|i,
	);

	foreach my $pattern (@candidates) {
		return $& if $RSS =~ $pattern;
	}

	return '';
}

# Close the shared database connection and end the program.
sub halt{
	$dbh->disconnect();
	exit();
}
