User talk:Davide Eynard
From semanticweb.org
Hi Davide,
I'm a student at the University of British Columbia in Vancouver, Canada, and I've got a wiki running for a science group I volunteer with.
I've been trying to write a Perl screen-scraping script for our news page so that I can generate a valid RSS feed, but my current script produces a feed containing unwanted characters. Do you have any suggestions for how to improve it?
Thanks,
#!/usr/bin/perl
# script for creating RSS feed from Wikipedia Recent Announcements page
# note, XML::RSS does not support RSS 0.92.
use strict;
use diagnostics;
use LWP::UserAgent;
use HTTP::Date;
# GETFILE true:  download the wiki edit page and cache it in $file_original.
# GETFILE false: reuse the cached copy (saves hits on the server while debugging).
use constant GETFILE => 1;

# URLs and file names used by the rest of the script.
my $url_link_base = "http://www.sciteam.ubc.ca/mw/index.php/SCI_Team:Current_News#";
my $url_announcements = "http://www.sciteam.ubc.ca/mw/index.php?title=SCI_Team:05-06/Current_News&action=edit";
my $url_meta = "http://meta.wikipedia.org/wiki/";
my $url_wiki = "http://en.wikipedia.org/wiki/";
my $file_template = "template.xml";
my $file_output = "output-new.txt";
my $file_rss = "news-new.xml";
my $file_original = "original-new.htm";

# $in receives the complete HTML of the edit page, from the network or the cache.
my $in;
if (GETFILE) {
    # Create a user agent object (spoof Mozilla since the server rejects libwww).
    my $ua = LWP::UserAgent->new;
    $ua->agent("Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030428 Mozilla Firebird/0.6 StumbleUpon/1.73");
    my $req = HTTP::Request->new(GET => $url_announcements);
    my $res = $ua->request($req);

    # A failed download is an error: report it on STDERR and exit non-zero
    # so that callers (cron, shell scripts) can detect the failure.
    if (!$res->is_success) {
        print STDERR "Bad luck this time ", $res->code, "\n";
        exit 1;
    }

    $in = $res->content;

    # Cache the page so later runs with GETFILE => 0 can reuse it.
    open(my $cache_fh, '>', $file_original)
        or die "Cannot write $file_original: $!\n";
    print {$cache_fh} $in;
    close($cache_fh)
        or die "Cannot finish writing $file_original: $!\n";
}
else {
    # We are just reading the locally cached file.
    open(my $cache_fh, '<', $file_original)
        or die "Cannot read $file_original: $!\n";
    # Slurp the whole file; $/ is only undefined inside this block.
    local $/;
    $in = <$cache_fh>;
    close($cache_fh);
}
# remove_html_tag($html, $tag)
#
# Returns a copy of $html with every opening and closing <$tag> marker
# removed (case-insensitively). Text between the markers is kept.
#   $html - string to clean
#   $tag  - literal tag name, e.g. "b" or "span"
sub remove_html_tag {
    my ($temp, $tag) = @_;
    # \Q...\E treats the tag name literally; \b stops "b" from also matching
    # <br> or <body>; [^>]* skips any attributes up to the closing bracket.
    $temp =~ s{</?\Q$tag\E\b[^>]*>}{}gi;
    return $temp;
}
# anchor($heading)
#
# Builds the URL for a section: $url_link_base followed by the heading
# with its spaces converted by linkify().
sub anchor {
    my ($heading) = @_;
    my $fragment = linkify($heading);
    return $url_link_base . $fragment;
}
# escape($text)
#
# Returns $text with the XML special characters &, < and > replaced by
# their entities, so that HTML markup can be embedded as character data
# inside an RSS element.
sub escape {
    my ($temp) = @_;
    # Ampersands first, otherwise the entities produced below would be
    # escaped a second time.
    $temp =~ s/&/&amp;/g;
    $temp =~ s/</&lt;/g;
    $temp =~ s/>/&gt;/g;
    return $temp;
}
# linkify($text)
#
# Returns $text with every space character turned into an underscore.
sub linkify {
    my ($text) = @_;
    $text =~ tr/ /_/;
    return $text;
}
# createlink($url, $name)
#
# Builds an HTML anchor element pointing at $url with $name as its text
# and returns it after passing it through escape().
sub createlink {
    my ($url, $name) = @_;
    return escape(qq{<a href="$url">$name</a>});
}
# metalink($url, $name)
#
# Returns an escaped anchor for a page under $url_meta. $url is the page
# title (spaces are converted by linkify()); $name is the link text.
sub metalink {
    my ($url, $name) = @_;
    my $target = $url_meta . linkify($url);
    return createlink($target, $name);
}
# wikilink($url, $name)
#
# Returns an escaped anchor for a page under $url_wiki. $url is the page
# title (spaces are converted by linkify()); $name is the link text.
sub wikilink {
    my ($url, $name) = @_;
    my $target = $url_wiki . linkify($url);
    return createlink($target, $name);
}
# fixtitle($title)
#
# Returns $title with all "[[" markers removed and then all "]]" markers
# removed, in that order.
sub fixtitle {
    my ($title) = @_;
    # Two separate passes, opening markers first.
    for my $marker ('[[', ']]') {
        $title =~ s/\Q$marker\E//g;
    }
    return $title;
}
# wiki2html($wikitext)
#
# Converts a fragment of wiki markup into HTML by applying a fixed sequence
# of substitutions and returns the result. Link markup is turned into
# anchors via metalink(), wikilink() and createlink(). The order of the
# substitutions matters: double-bracket links are rewritten before the
# single-bracket (external) forms.
sub wiki2html {
my ($temp) = @_;
# a newline followed by * or : becomes a double line break
$temp =~ s/\n[\*\:]/<br><br>/gism;
# deal with strong markups: '''text''' becomes <strong>text</strong>
$temp =~ s/'''(.*?)'''/<strong>$1<\/strong>/gi;
# links to Meta without a label (disabled; do we need this?)
#$temp =~ s/\[\[m:([^|]*?)]]/metalink($1, $2)/gisme;
# links to Meta with a label, i.e. [[m:Page|label]]
$temp =~ s/\[\[m:(.*?)\|(.*?)]]/metalink($1, $2)/gisme;
# regular wiki links, i.e. [[Canada]]; the page title doubles as the link text
# | is a literal in square brackets.
$temp =~ s/\[\[([^|]*?)]]/wikilink($1, $1)/gisme;
# wiki links with a label, i.e. [[United States of America|USA]]
$temp =~ s/\[\[(.*?)\|(.*?)]]/wikilink($1, $2)/gisme;
# deal with external links of the form [url label]
$temp =~ s/\[(.*?) (.*?)]/createlink($1, $2);/gisme;
# single form of external link, [url]; the URL doubles as the link text
$temp =~ s/\[(.*?)]/createlink($1, $1);/gisme;
# remove starting <br><br> if necessary
# NOTE(review): this pattern is unanchored and uses /g, so it removes every
# <br><br> (with any whitespace before it), not only a leading one --
# confirm whether that is intended.
$temp =~ s/\s*<br><br>//gism;
return $temp;
}
# --- Extract the wikitext and turn it into feed items ------------------
#
# Input:  $in, the HTML of the wiki edit page.
# Output: @content, one hash reference per news section, each with the
#         keys "title", "link" and "description".

# Pull the raw wikitext out of the edit form's <textarea>. Stop here if it
# is missing rather than continuing with an undefined or stale $1.
# NOTE(review): the edit form presumably serves the wikitext entity-encoded
# (&amp;, &lt;, ...) and nothing below decodes it -- confirm whether that is
# a source of the unwanted characters in the feed.
if ($in =~ m|<textarea.*?>(.*?)</textarea>|ism) {
    $in = $1;
}
else {
    die "No <textarea> found in the fetched page; cannot extract the news text\n";
}

# Drop everything up to and including the "Current news" heading, and
# everything from the first trailing section heading onwards.
$in =~ s|.*?==\s*?Current news.*?==||gism;
$in =~ s|==\s*?Number of article milestones\s*?==.*||gism;
$in =~ s|==\s*?Announcement Archives\s*?==.*||gism;
$in =~ s|==\s*?News Archives\s*?==.*||gism;

# Store the trimmed wikitext for inspection.
open(my $out_fh, '>', $file_output)
    or die "Cannot write $file_output: $!\n";
print {$out_fh} $in;
close($out_fh)
    or die "Cannot finish writing $file_output: $!\n";

my @content;
my @result;

# @result alternates heading, body, heading, body, ...
# The lookahead accepts either the next "==" or the end of the text, so
# the final section (which has no heading after it) is captured as well.
@result = $in =~ m{==\s*(.*?)\s*==(.*?)(?===|\z)}gism;

for my $idx (0 .. $#result) {
    $result[$idx] =~ s/^\s*?//gism;
    $result[$idx] =~ s/\s*?$//gism;

    # Even indexes hold the heading (the date); nothing more to do there.
    next if $idx % 2 == 0;

    # Odd indexes hold a section body: a list of "*" bullet items.
    # Trim the initial *, then split on the bullets that start later items.
    $result[$idx] =~ s/^\s*?\*//g;
    my @items = split /\n\*+/, $result[$idx];
    for my $n (0 .. $#items) {
        print "Item $n: $items[$n]\n\n";
    }
    $result[$idx] = '<p>' . join('</p><p>', @items) . '</p>';
}

# Pair each heading with its body and build one feed entry per pair.
for my $i (0 .. ($#result-1)/2) {
    my %temphash;
    $temphash{"title"} = fixtitle($result[$i*2]);
    $temphash{"link"} = anchor($result[$i*2]);
    $temphash{"description"} = wiki2html($result[$i*2+1]);
    push @content, \%temphash;
}
# --- Render the RSS file from the template ------------------------------
#
# Copies $file_template to $file_rss line by line. A line containing
# $content$ is replaced by one <item> element per entry in @content; on a
# line containing $lastbuilddate$ that marker is replaced by the current
# time as formatted by time2str(). All other lines are copied unchanged.

# Make sure the template is read one line at a time.
$/ = "\n";

open(my $template_fh, '<', $file_template)
    or die "Cannot read $file_template: $!\n";
open(my $rss_fh, '>', $file_rss)
    or die "Cannot write $file_rss: $!\n";

while (my $line = <$template_fh>) {
    if ($line =~ /\$content\$/) {
        # The placeholder line itself is dropped; the items take its place.
        for my $entry (@content) {
            my $item = "<item>\n";
            $item .= "<title>" . $entry->{"title"} . "</title>\n";
            $item .= "<description>" . $entry->{"description"} . "</description>\n";
            $item .= "<link>" . $entry->{"link"} . "</link>\n";
            $item .= "</item>\n\n";
            print {$rss_fh} $item;
        }
        next;
    }

    if ($line =~ /\$lastbuilddate\$/) {
        $line =~ s/\$lastbuilddate\$/time2str(time)/e;
    }
    print {$rss_fh} $line;
}

close($template_fh);
# Buffered write errors only surface at close, so check it.
close($rss_fh)
    or die "Cannot finish writing $file_rss: $!\n";
