User talk:Davide Eynard

From semanticweb.org

Jump to: navigation, search

Hi Davide,

I'm a student at the University of British Columbia in Vancouver, Canada, and I've got a wiki running for a science group I volunteer with.

I've been trying to set up a Perl screen scraper for our news page so that I can generate a valid RSS feed, but my current script generates a feed with unwanted characters. Do you have any suggestions about how to improve it?

Thanks,

Kyle Hunter

#!/usr/bin/perl
# script for creating RSS feed from Wikipedia Recent Announcements page
# note, XML::RSS does not support RSS 0.92.

use strict;
use diagnostics;
use LWP::UserAgent;
use HTTP::Date;

# GETFILE: true  => download the edit page and cache it in $file_original;
#          false => reuse the cached copy (saves hits while debugging).
use constant GETFILE => 1;

# Slurp mode: with no input record separator a <HANDLE> read returns the
# whole file at once.  It is set back to "\n" further down, before the
# template is read line by line.
$/ = undef;

# Prefix for per-item links; anchor() appends the section title to it.
my $url_link_base = "http://www.sciteam.ubc.ca/mw/index.php/SCI_Team:Current_News#";
# Edit view of the news page; the wikitext is taken from its <textarea>.
my $url_announcements = "http://www.sciteam.ubc.ca/mw/index.php?title=SCI_Team:05-06/Current_News&action=edit";
# Base URLs that metalink() and wikilink() prepend to their link targets.
my $url_meta = "http://meta.wikipedia.org/wiki/";
my $url_wiki = "http://en.wikipedia.org/wiki/";
# Template file scanned for the $content$ and $lastbuilddate$ placeholders.
my $file_template = "template.xml";
# Debug dump of the trimmed wikitext.
my $file_output = "output-new.txt";
# The generated feed.
my $file_rss = "news-new.xml";
# Cached copy of the downloaded edit page.
my $file_original = "original-new.htm";

# The page being processed: first the raw HTML, later only the wikitext.
my $in;

# Load the raw HTML of the wiki edit page into $in.
#
# With GETFILE true the page is downloaded from $url_announcements and a copy
# is cached in $file_original; with GETFILE false the cached copy is read
# instead (saves hits during debugging).  Any failure aborts the script with
# a message on STDERR and a non-zero exit status, so a broken run can never
# be mistaken for a successful one.
if (GETFILE) {
    # Create a user agent object (spoof Mozilla since it rejects libwww)
    my $ua = LWP::UserAgent->new;
    $ua->agent("Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030428 Mozilla Firebird/0.6 StumbleUpon/1.73");

    my $req = HTTP::Request->new(GET => $url_announcements);
    my $res = $ua->request($req);

    # Check the outcome of the response
    if (!$res->is_success) {
        die "Bad luck this time ", $res->code, "\n";
    }

    $in = $res->content;

    # Cache the download so later debugging runs can set GETFILE to 0.
    open(my $cache_fh, '>', $file_original)
        or die "Cannot write $file_original: $!\n";
    print {$cache_fh} $in;
    close($cache_fh)
        or die "Cannot finish writing $file_original: $!\n";
}
else {
    # we are just getting the local file.
    open(my $cache_fh, '<', $file_original)
        or die "Cannot read $file_original: $!\n";
    # Slurp the whole file regardless of the current global value of $/.
    local $/;
    $in = <$cache_fh>;
    close($cache_fh);
}

# Strip every opening and closing occurrence of one HTML tag from a string,
# leaving the text between the tags in place.
#
#   $temp - the HTML text to clean
#   $tag  - the tag name (e.g. "b"), matched case-insensitively
#
# Returns the cleaned copy.  The tag name is matched literally and must end
# at a name boundary, so removing "b" leaves "<br>" and "<body>" alone.
sub remove_html_tag {
    my ($temp, $tag) = @_;
    # [^>]* also covers attributes that are spread over several lines.
    $temp =~ s/<\/?\Q$tag\E(?![\w:-])[^>]*>//gi;
    return $temp;
}

# Build the link for a news section: the section title, with spaces turned
# into underscores by linkify(), appended to $url_link_base.
sub anchor {
    my ($heading) = @_;
    return $url_link_base . linkify($heading);
}

# Entity-encode a piece of markup so it can be embedded as text inside the
# RSS XML.
#
#   $temp - the markup to encode
#
# Returns the encoded copy.  The ampersand is replaced first so that the
# entities produced for "<" and ">" are not encoded a second time.
sub escape {
    my ($temp) = @_;
    $temp =~ s/&/&amp;/g;
    $temp =~ s/</&lt;/g;
    $temp =~ s/>/&gt;/g;
    return $temp;
}

# Turn a page title into its URL form by replacing every space with an
# underscore.  Returns the converted copy.
sub linkify {
    my ($title) = @_;
    (my $slug = $title) =~ tr/ /_/;
    return $slug;
}

# Build an HTML anchor element for $url with $name as the link text and
# return it after passing it through escape().
sub createlink {
    my ($url, $name) = @_;
    return escape(qq{<a href="$url">$name</a>});
}

# Build a link to a page under $url_meta.
#
#   $url  - the page title; spaces are converted by linkify()
#   $name - the link text
#
# Returns whatever createlink() produces for that target.
sub metalink {
    my ($url, $name) = @_;
    my $target = $url_meta . linkify($url);
    return createlink($target, $name);
}

# Build a link to a page under $url_wiki.
#
#   $url  - the page title; spaces are converted by linkify()
#   $name - the link text
#
# Returns whatever createlink() produces for that target.
sub wikilink {
    my ($url, $name) = @_;
    my $target = $url_wiki . linkify($url);
    return createlink($target, $name);
}

# Remove the wiki link brackets from a section title so that it can be used
# as plain text.  Returns the cleaned copy.
sub fixtitle {
    my ($title) = @_;
    # Two separate passes, opening brackets first.
    for my $bracket ('[[', ']]') {
        $title =~ s/\Q$bracket\E//g;
    }
    return $title;
}

# Convert a fragment of wiki markup into the HTML used for an item
# description.
#
#   $temp - the wiki text of one news section
#
# Returns the converted copy.  Handled markup: list/indent markers at the
# start of a line, '''bold''', [[m:Page|text]], [[Page]], [[Page|text]],
# [url text] and [url].  Links are produced by metalink(), wikilink() and
# createlink().
sub wiki2html {
    my ($temp) = @_;
    # A line starting with * or : becomes a break.  The self-closing form is
    # used so the fragment stays well-formed when it is written into the
    # feed XML.
    $temp =~ s/\n[*:]/<br \/><br \/>/g;
    # deal with strong markups.
    $temp =~ s/'''(.*?)'''/<strong>$1<\/strong>/g;
    # links to Meta.  The target may not contain ] or |, so a match can
    # never run across the end of one link into the next one.
    $temp =~ s/\[\[m:([^\]|]*)\|(.*?)]]/metalink($1, $2)/gise;
    # regular wiki links [[Canada]]
    $temp =~ s/\[\[([^|]*?)]]/wikilink($1, $1)/gse;
    # wiki links i.e. [[United States of America|USA]]
    $temp =~ s/\[\[([^\]|]*)\|(.*?)]]/wikilink($1, $2)/gse;
    # external links with a label: [http://example.org some text].  The URL
    # may not contain ] or whitespace, so "[url] more [url text]" is no
    # longer swallowed as one link.
    $temp =~ s/\[([^\]\s]+) ([^\]]*)]/createlink($1, $2)/ge;
    # single form of external link
    $temp =~ s/\[([^\]]*)]/createlink($1, $1)/ge;
    # Remove a break at the very start only; the ones between lines stay.
    $temp =~ s/\A\s*<br \/><br \/>//;
    return $temp;
}

# Extract the wikitext from the edit form.  Stop when the page has no
# <textarea>: $1 would otherwise be undefined (or left over from an earlier
# match) and every later step would work on garbage.
my ($wikitext) = $in =~ m|<textarea.*?>(.*?)</textarea>|ism
    or die "No <textarea> found in the downloaded page\n";
$in = $wikitext;

# Keep only the news sections: drop everything up to and including the
# "Current news" heading, and everything from the first trailing heading on.
$in =~ s|.*?==\s*?Current news.*?==||gism;
$in =~ s|==\s*?Number of article milestones\s*?==.*||gism;
$in =~ s|==\s*?Announcement Archives\s*?==.*||gism;
$in =~ s|==\s*?News Archives\s*?==.*||gism;

# store output
open(my $out_fh, '>', $file_output)
    or die "Cannot write $file_output: $!\n";
print {$out_fh} $in;
close($out_fh)
    or die "Cannot finish writing $file_output: $!\n";

# Split into (title, body, title, body, ...).  A body ends at the next
# heading or at the end of the text; without the end-of-text alternative the
# last section, which has no heading after it, would be lost.
my @result = $in =~ m{==\s*(.*?)\s*==(.*?)(?===|\z)}gis;

for my $idx (0 .. $#result) {
    # Trim surrounding whitespace.
    $result[$idx] =~ s/\A\s+//;
    $result[$idx] =~ s/\s+\z//;

    # Even slots hold the section titles; only the bodies need more work.
    next if $idx % 2 == 0;

    # trim initial *.
    $result[$idx] =~ s/\A\*//;

    # Every further bullet starts a new paragraph.
    my @items = split /\n\*+/, $result[$idx];

    for my $n (0 .. $#items) {
        print "Item $n: $items[$n]\n\n";
    }

    $result[$idx] = '<p>' . join('</p><p>', @items) . '</p>';
}

# Turn each (title, body) pair into one feed item.
my @content;

for (my $i = 0; $i < $#result; $i += 2) {
    my $title = fixtitle($result[$i]);
    my %temphash;
    $temphash{"title"} = $title;
    # Link to the title without its [[ ]] brackets, the same text that is
    # shown as the item title.
    $temphash{"link"} = anchor($title);
    $temphash{"description"} = wiki2html($result[$i + 1]);
    push @content, \%temphash;
}


# Write the feed: copy $file_template to $file_rss line by line, replacing
# the line that holds $content$ with one <item> element per entry of
# @content, and $lastbuilddate$ with the current time as formatted by
# time2str().

# Back to line-by-line reading.
$/ = "\n";

open(my $template_fh, '<', $file_template)
    or die "Cannot read $file_template: $!\n";
open(my $rss_fh, '>', $file_rss)
    or die "Cannot write $file_rss: $!\n";

while (my $line = <$template_fh>) {

    if ($line =~ /\$content\$/) {
        # The placeholder line itself is not copied to the output.
        for my $entry (@content) {
            my $item = "<item>\n";
            $item .= "<title>" . $entry->{"title"} . "</title>\n";
            $item .= "<description>" . $entry->{"description"} . "</description>\n";
            $item .= "<link>" . $entry->{"link"} . "</link>\n";
            $item .= "</item>\n\n";
            print {$rss_fh} $item;
        }
        next;
    }

    # Does nothing on lines without the placeholder.
    $line =~ s/\$lastbuilddate\$/time2str(time)/e;
    print {$rss_fh} $line;
}

close($template_fh);
# Buffered write errors only show up when the handle is closed.
close($rss_fh)
    or die "Cannot finish writing $file_rss: $!\n";
Personal tools