幻影的聚合程序

以前的聚合程序是用PHP写的抓取脚本,我的PHP能力太差,因此写得很挫。这两天用Perl改写了一下,感觉还是Perl用起来更舒服。有几个朋友想要,就贴出来。点名BS(鄙视)一下百度,为什么就不愿意遵守RSS标准?

  1. #!/usr/local/bin/perl
  2.  
  3. =pod
  4. CREATE TABLE `rss_feeds` (
  5.   `id` int(11) NOT NULL AUTO_INCREMENT,
  6.   `title` varchar(255) NOT NULL,
  7.   `author` varchar(50) NOT NULL,
  8.   `link` text NOT NULL,
  9.   `description` longtext NOT NULL,
  10.   `pub_date` int(11) DEFAULT NULL,
  11.   `content_md5` varchar(32) NOT NULL,
  12.   PRIMARY KEY (`id`),
  13.   UNIQUE KEY `content_idx` (`content_md5`)
  14. ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
  15.  
  16. CREATE TABLE `rss_sites` (
  17.   `id` int(11) NOT NULL AUTO_INCREMENT,
  18.   `username` varchar(255) DEFAULT NULL,
  19.   `site_url` varchar(255) DEFAULT NULL,
  20.   `rss_url` varchar(255) DEFAULT NULL,
  21.   `failed_times` int(11) DEFAULT '0',
  22.   PRIMARY KEY (`id`)
  23. ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
  24. =cut
  25.  
  26. use strict;
  27. use warnings;
  28. use DBI;
  29. use Encode;
  30. use LWP::Simple;
  31. use XML::Simple;
  32. use Digest::MD5 qw(md5_hex);
  33. use Date::Parse qw(str2time);
  34.  
  # --- Configuration ---------------------------------------------------------
  # Connection settings for the MySQL database that stores the aggregator data.
  my $db_host = 'localhost';
  my $db_name = 'icylife';
  my $db_user = 'yunshu';
  my $db_pwd = 'pwd';

  # Tables used by the script. NOTE(review): the schema in the POD block at the
  # top of the file names them without the 'pst_' prefix - confirm which is right.
  my $site_table = 'pst_rss_sites';
  my $feed_table = 'pst_rss_feeds';

  # --- Driver ----------------------------------------------------------------
  # RaiseError => 1 makes DBI calls die on failure instead of returning false.
  my $dbh = DBI->connect( "DBI:mysql:database=$db_name;host=$db_host", $db_user, $db_pwd, {RaiseError => 1} );
  $dbh->do( 'set names "utf8"' ) or die $dbh->errstr;

  # rss_url is a nullable column and the lookup may hand back undef on failure,
  # so drop anything that is not a usable URL before fetching.
  my @urls = grep { defined($_) && length($_) } GetRssUrl( );

  # With no URLs the loop simply does not run, and the handle is still closed
  # cleanly (the previous early exit skipped the disconnect).
  foreach my $url ( @urls )
  {
      ProcessRss( $url );
  }
  $dbh->disconnect;
  57.  
  # GetRssUrl()
  #
  # Reads the feed URL of every site registered in $site_table, using the
  # file-level $dbh handle.
  #
  # Returns: a list of rss_url strings (rows whose rss_url is NULL or empty are
  #          skipped). On a query failure a warning is printed and an EMPTY
  #          list is returned, so callers can test the result with "if (!@urls)".
  sub GetRssUrl
  {
      my @urls;
      my $sth;

      # $dbh is opened with RaiseError, so prepare() can die just like
      # execute(); both therefore live inside the eval.
      my $ok = eval
      {
          $sth = $dbh->prepare( 'select rss_url from '.$site_table );
          $sth->execute( );
          1;
      };
      if( !$ok )
      {
          my $err = defined($@) ? $@ : '';
          chomp $err;
          warn "get site list from db failed: $err\n";

          # Bare return: "return undef" would give a one-element list (undef)
          # in list context and defeat the caller's emptiness check.
          return;
      }

      while( my ($rss_url) = $sth->fetchrow_array )
      {
          # The column is declared DEFAULT NULL; skip rows without a URL.
          next if !defined($rss_url) || $rss_url eq '';
          push( @urls, $rss_url );
      }
      $sth->finish;

      return @urls;
  }
  82.  
  # ProcessRss( $url )
  #
  # Downloads one RSS feed, parses it and stores every complete item in
  # $feed_table through the file-level $dbh handle.
  #
  # Parameters:
  #   $url - feed address, as stored in the rss_url column of $site_table.
  #
  # Side effects:
  #   * failed_times for the site is incremented when the download or the XML
  #     parse fails, and reset to 0 once the feed has been parsed.
  #   * one row is inserted per item; items whose description is already stored
  #     are rejected by the unique content_md5 key and silently skipped.
  #
  # Returns nothing. Failures of the failed_times updates propagate as
  # exceptions because $dbh is opened with RaiseError.
  sub ProcessRss
  {
      my $url = shift;
      return if !defined($url) || $url eq '';

      # Placeholders keep quotes or SQL fragments inside a URL from breaking
      # (or hijacking) the statement.
      my $sql_failed = 'update '.$site_table.' set failed_times=failed_times+1 where rss_url=?';
      my $sql_ok = 'update '.$site_table.' set failed_times=0 where rss_url=?';
      my $sql_insert = 'insert into '.$feed_table.'(title, author, link, description, pub_date, content_md5) values(?,?,?,?,?,?)';

      my $xml = get( $url );
      if( !defined($xml) )
      {
          warn localtime()." get $url failed.\n";
          $dbh->do( $sql_failed, undef, $url );
          return;
      }

      # NOTE(review): this assumes the response body is UTF-8; a feed served in
      # another encoding would be mis-decoded here - confirm against real feeds.
      $xml = decode_utf8($xml);
      # The text is already decoded, so make the XML declaration agree with it.
      $xml =~ s/encoding="(.*?)"/encoding="utf-8"/;

      my $rss;
      my $parsed = eval
      {
          # ForceArray: a feed with a single <item> must still yield an array
          # reference. KeyAttr => []: never fold items into a hash keyed by an
          # id/name/key attribute.
          $rss = XMLin( $xml, ForceArray => ['item'], KeyAttr => [] );
          1;
      };
      if( !$parsed )
      {
          my $err = defined($@) ? $@ : '';
          chomp $err;
          warn localtime()." Parse $url failed: $err\n";
          $dbh->do( $sql_failed, undef, $url );
          return;
      }
      $dbh->do( $sql_ok, undef, $url );

      # Tolerate documents without the usual rss/channel/item layout.
      my $channel = ref($rss) eq 'HASH' ? $rss->{channel} : undef;
      my $items = ref($channel) eq 'HASH' ? $channel->{item} : undef;
      return if ref($items) ne 'ARRAY';

      foreach my $item ( @{$items} )
      {
          next if ref($item) ne 'HASH';

          # All four fields are required and must be plain text; XML::Simple
          # turns empty or attribute-bearing elements into references.
          my ( $description, $link, $title, $pub_text ) = @{$item}{qw(description link title pubDate)};
          next if grep { !defined($_) || ref($_) } ( $description, $link, $title, $pub_text );

          # Prefer <author>, then <dc:creator>, else the historical placeholder.
          my $author = 'unknow';
          foreach my $candidate ( $item->{author}, $item->{'dc:creator'} )
          {
              next if !defined($candidate) || ref($candidate) || !$candidate;
              $author = $candidate;
              last;
          }

          # undef when the date cannot be parsed; pub_date allows NULL.
          my $pubdate = str2time( $pub_text );

          # Disabled in the original as well (Baidu timezone workaround and a
          # debugging print), kept here for reference:
          #   if( $url =~ m{^http://hi\.baidu\.com/} )
          #   {
          #       $pubdate += 8 * 3600;
          #   }
          #   print encode("utf8", "$author $link $title ".localtime($pubdate)." ");
          #   print md5_hex(encode("utf8",$description))."\n";

          my $inserted = eval
          {
              $dbh->do( $sql_insert, undef, $title, $author, $link, $description, $pubdate, md5_hex(encode("utf8",$description)) );
              1;
          };
          if( !$inserted )
          {
              my $err = defined($@) ? $@ : '';
              chomp $err;
              my $code = $dbh->err;

              # 1062 is MySQL's duplicate-key error: the item is already
              # stored, which is the normal case on every re-run. Anything
              # else is worth reporting, but one bad item must not stop the
              # rest of the feed.
              if( !( defined($code) && $code == 1062 ) )
              {
                  warn localtime()." insert item from $url failed: $err\n";
              }
          }
      }

      return;
  }
  154.  

此条目发表在技术分类目录。将固定链接加入收藏夹。

《幻影的聚合程序》有 3 条评论

  1. System说:

    脚本小子 世界因你而精彩

  2. never说:

    大公司都是如此的嘛

发表评论

电子邮件地址不会被公开。 必填项已用*标注