User:FairuseBot/libBot.pm
Appearance
#!/usr/bin/perl # libBot: A Perl module of useful routines for running a bot ########## WARNING: NOT ALL OF THESE FUNCTIONS WILL WORK PROPERLY WITH Pearle.pm ########## package libBot; use strict; use warnings; use Pearle; use Data::Dumper; require Exporter; our @ISA = qw(Exporter); our @EXPORT = qw(config usernotify wikilog botwarnlog notelog LoadInfoboxPatterns FixupLinks MakeWikiRegex DoIHaveMessages GetPageCategories GetPageLinks GetPageText GetPageList GetFullPageList SaveImage UpdateLink RemoveImageFromPage IsNotified isDated getDate getUploadDates getLastEditDate GetImageUploader loadNotificationList saveNotificationList); our $VERSION = 1.00; my $test_only = 0; my $username = ""; my @infobox_patterns = (); sub config { my %params = @_; $test_only = $params{test_only} if(defined($params{test_only})); $username = $params{username} if(defined($params{username})); } # Log a warning on a user's talkpage, using an existing edit session sub usernotify { my ($wikipage, $text, $user, $summary); $wikipage = $_[1]; $summary = $_[2]; $summary = "Logging warning message" if(!defined($summary)); # We've been handed an editing session Pearle::myLog(4, "Warning with existing edit session\n"); if($test_only) { print STDERR $_[0]; return; } if($wikipage->getWikiText() =~ /^#redirect/i) { botwarnlog("*User talk page [[User talk:$user]] is a redirect\n"); return; } $text = $wikipage->getEditableText(); $text .= $_[0]; $wikipage->setEditableText($text); Pearle::postPage($wikipage, $summary, 0); print STDERR $_[0]; } # General-purpose on-Wiki logging routine sub wikilog { my($target, $text, $wikipage, $summary); $target = $_[0]; $summary = $_[2] || "Logging note"; $wikipage = Pearle::getPage($target); if($test_only) { print STDERR $_[1]; return; } $text = $wikipage->getEditableText(); $text .= $_[1]; $wikipage->setEditableText($text); Pearle::postPage($wikipage, $summary, 0); print STDERR $_[1]; } # Log a warning on the talk page of the bot sub botwarnlog { my ($page, $text, $summary); $text = $_[0]; $summary = $_[1]; $summary = "Logging warning message" if(!defined($summary)); $page = "User talk:${username}/log"; wikilog($page, $text, $summary); } # Log a notification message to the console sub notelog { print STDERR @_; } # Fix all wikilinks in a string so that they shows as a link, not inline, if it's for a category or image sub FixupLinks { my $link = shift; $link =~ s/\[\[(Category|Image)/[[:$1/g; return $link; } # Make a string into a Wikipedia-compatible regex sub MakeWikiRegex { my $string = shift; my @chars = split //, $string; my $result = ''; foreach my $char (@chars) { # Escape metacharacters, and add percent-encoding for certain characters if($char eq '\\') {$result .= '\\\\';} elsif($char eq '.') {$result .= '\.';} elsif($char eq '(') {$result .= '(?:\(|%28)';} elsif($char eq ')') {$result .= '(?:\)|%29)';} elsif($char eq '[') {$result .= '\[';} elsif($char eq ']') {$result .= '\]';} elsif($char eq '+') {$result .= '\+';} elsif($char eq '*') {$result .= '\*';} elsif($char eq '?') {$result .= '(?:\?|%3F)';} elsif($char eq '^') {$result .= '\^';} elsif($char eq '$') {$result .= '\$';} elsif($char eq '&') {$result .= '(?:&|%26)';} elsif($char eq '!') {$result .= '(?:!|%21)';} elsif($char eq '~') {$result .= '(?:~|%7E)';} elsif($char eq "'") {$result .= "(?:'|%27)";} elsif($char eq '"') {$result .= '(?:"|%22)';} elsif($char eq ',') {$result .= '(?:,|%2C)';} else {$result .= $char;} } # Process the string to match both with spaces and with underscores $result =~ s/[ _]/[ _]+/g; # Process the string to match both upcase and lowercase first characters if($result =~ /^[A-Za-z]/) { $result =~ s/^(.)/"[$1".lc($1)."]"/e; } return $result; } sub HTMLEncode { my $char = shift; return sprintf("&X%X;", ord($char)); } # Make a string into something that can match most image name formats sub MakeFancyRegex { my $string = shift; my @chars = split //, $string; my $result; foreach my $char (@chars) { if($char eq '\\') { $result .= "(\\\\|%5C|%5c|&x5C;)"; } elsif($char eq '.') { } elsif($char eq '(') { } elsif($char eq ')') { } else { $result .= "($char|" . uri_escape_utf8($char) . "|" . lc(uri_escape_utf8($char)) . "|" . HTMLEncode($char) . "|" . lc(HTMLEncode($char)) . ")"; } } return $result; } # Check for new talk page messages sub DoIHaveMessages { my $xml = shift; my $parsed_xml = Pearle::getXMLParser()->XMLin($xml); if(exists($parsed_xml->{query}->{userinfo}->{messages}) and defined($parsed_xml->{query}->{userinfo}->{messages})) { return 1; } else { return 0; } } sub GetPageCategories { my $image_data = shift; my @pages = (); if(defined($image_data)) { my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data); Pearle::myLog(4, Dumper($parsed_xml)); if(exists($parsed_xml->{query}->{pages}->{page}->{categories}->{cl}) and defined($parsed_xml->{query}->{pages}->{page}->{categories}->{cl})) { if(ref($parsed_xml->{query}->{pages}->{page}->{categories}->{cl}) eq 'ARRAY') { my @all_pages = @{$parsed_xml->{query}->{pages}->{page}->{categories}->{cl}}; @pages = map {$_->{title}} @all_pages; } else { @pages = ($parsed_xml->{query}->{pages}->{page}->{categories}->{cl}->{title}); } } } return @pages; } sub GetPageLinks { my $image_data = shift; my @pages = (); if(defined($image_data)) { my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data); Pearle::myLog(4, Dumper($parsed_xml)); if(exists($parsed_xml->{query}->{pages}->{page}->{links}->{pl}) and defined($parsed_xml->{query}->{pages}->{page}->{links}->{pl})) { if(ref($parsed_xml->{query}->{pages}->{page}->{links}->{pl}) eq 'ARRAY') { my @all_pages = @{$parsed_xml->{query}->{pages}->{page}->{links}->{pl}}; @pages = map {$_->{title}} @all_pages; } else { @pages = ($parsed_xml->{query}->{pages}->{page}->{links}->{pl}->{title}); } } } return @pages; } sub GetPageText { my $image_data = shift; my $text = undef; if(defined($image_data)) { my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data); Pearle::myLog(4, Dumper($parsed_xml)); if(exists($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}) and defined($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev})) { $text = $parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}; } } return $text; } sub GetPageList { my $image_data = shift; my $image; my @pages = (); if(defined($image_data)) { my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data); my $image = $parsed_xml->{query}->{pages}->{page}->{title}; Pearle::myLog(4, Dumper($parsed_xml)); if(exists($parsed_xml->{query}->{imageusage}->{iu}) and defined($parsed_xml->{query}->{imageusage}->{iu})) { if(ref($parsed_xml->{query}->{imageusage}->{iu}) eq 'ARRAY') { my @bad_pages = grep {$_->{ns} == 10 or $_->{ns} == 12} @{$parsed_xml->{query}->{imageusage}->{iu}}; my @good_pages = grep {$_->{ns} != 10 and $_->{ns} != 12} @{$parsed_xml->{query}->{imageusage}->{iu}}; @pages = map {$_->{title}} @good_pages; if(scalar(@bad_pages) > 0 and defined($image)) # If "image" is undefined, we're probably doing a pure usage check, rather than one in preparation for removal { my $notice; foreach my $page (@bad_pages) { $notice .= "*Found image [[:$image]] in [[$page->{title}]]\n"; } botwarnlog($notice); } } else { if($parsed_xml->{query}->{imageusage}->{iu}->{ns} != 10 and $parsed_xml->{query}->{imageusage}->{iu}->{ns} != 12) { @pages = $parsed_xml->{query}->{imageusage}->{iu}->{title}; } else { if(defined($image)) { botwarnlog("*Found image [[:$image]] in [[$parsed_xml->{query}->{imageusage}->{iu}->{title}]]\n"); } } } } } return @pages; } # Get all pages. Don't filter for bad namespaces. sub GetFullPageList { my $image = shift; my @pages = (); my $xml = Pearle::APIQuery(list => 'imageusage', iutitle => $image); if(defined($xml)) { my $parsed_xml = Pearle::getXMLParser()->XMLin($xml); Pearle::myLog(4, Dumper($parsed_xml)); if(exists($parsed_xml->{query}->{imageusage}->{iu}) and defined($parsed_xml->{query}->{imageusage}->{iu})) { if(ref() eq 'ARRAY') { @pages = map {$_->{title}} @{$parsed_xml->{query}->{imageusage}->{iu}}; } else { @pages = $parsed_xml->{query}->{imageusage}->{iu}->{title}; } } } return @pages; } sub UpdateLink { my $page = shift; my $from = shift; my $to = shift; die "No page to edit" if(!defined($page)); die "No link to change" if(!defined($from)); die "No new link" if(!defined($to)); Pearle::myLog(3, "Updating link from $from to $to\n"); my $wikipage = Pearle::getPage($page); $wikipage->canonicalizeLinks(); my $text = $wikipage->getEditableText(); my $link_regex = MakeWikiRegex($from); my $matches = $text =~ s/\x01($link_regex)\x02/\x01${to}|${1}\x02/gi; $matches += $text =~ s/\x01$link_regex\|/\x01${to}|/gi; $wikipage->setEditableText($text); Pearle::postPage( $wikipage, "Updating link to bypass a redirect or disambiguation page", 0); return $matches; } sub RemoveImageFromPage { my $image = shift; my $page = shift; my $image_regex = shift; my $removal_prefix = shift; my $removal_comment = shift; my $wikipage; my $text; my ($match1, $match2); my $old_length; my $new_length; my $change_len; my $match_len; tryagain: # Fetch an article page $wikipage = Pearle::getPage($page); $wikipage->canonicalizeLinks(); $text = $wikipage->getEditableText(); if(!defined($text)) { Pearle::myLog(1, "Error: Bad edit page [[$page]]\n"); botwarnlog(FixupLinks("*Error: Bad edit page [[$page]]\n")); sleep(300); return 0; } if($text =~ /^\s*$/) { # Might be protected instead of empty Pearle::myLog(1, "Error: Empty or protected page [[$page]]\n"); botwarnlog(FixupLinks("*Error: Empty or protected page [[$page]]\n")); sleep(300); return 0; } if($text =~ /^#redirect/i) { Pearle::myLog(1, "Redirect found for page [[$page]] (image [[:$image]])\n"); botwarnlog(FixupLinks("*Redirect found for page [[$page]] (image [[:$image]])\n")); print $text; return 0; } # Remove the image my $regex3 = "(\x01${image_regex}[^\x01]*?(\x01[^\x02]*?\x02[^\x01]*?|)+\x02[ \\t]*)"; # Regex to match images #my $regex3 = "( # \x01 # Open double-bracket for the image # ${image_regex} # The image itself # [^\x01]*? # Anything up to the first link in the caption, or a closing double bracket (minimal match) # (\x01 # Open double-bracket for a link in the caption # [^\x02]*? # Anything but a closing double-bracket # \x02 # The closing double-bracket for the link # [^\x01]*?|) # Any non-link text, or nothing # + # Matches one or more times # \x02 # The closing double-bracket for the image # [ \\t]*) # Any trailing whitespace # "; my $regex3ex = "\\w[ \\t]*${regex3}[ \\t]*\\w"; # Regex to try to spot inline images my $regex3g = "(${image_regex}.*)"; # Regex to match gallery images my ($raw_image) = $image =~ /Image:(.*)/; my $regex4m = "\x01[ _]*[Mm]edia[ _]*:[ _]*" . MakeWikiRegex($raw_image) . "[ _]*\\|([^]]*)\x02"; # Regex to match inline Media: links Pearle::myLog(3, "Regex 3: $regex3\n"); notelog("Regex 3: $regex3\n"); notelog("Regex 3 extended: $regex3ex\n"); notelog("Regex 3 gallery: $regex3g\n"); Pearle::myLog(3, "Raw regex: $raw_image\n"); notelog("Regex 4 Media: $regex4m\n"); if($text =~ /$regex3ex/) { Pearle::myLog(1, "Possible inline image in [[$page]]\n"); botwarnlog(FixupLinks("*Possible inline image [[:$image]] in [[$page]]\n")); return 0; # Can't do gallery matching because that also matches regular images, and odds are, we don't have an infobox } if($text =~ /-->/) # With the new editing markup, any close-comment means that somebody fucked up their wikimarkup { Pearle::myLog(3, "Fractional comment in page [[$page]]\n"); botwarnlog(FixupLinks("*Fractional comment in page [[$page]]\n")); } $text =~ /$regex3/; $match_len = length($1); $match2 = $text =~ s/$regex3/<!-- $removal_prefix $1 -->/g; $new_length = length($text); print "Num: $match2 Len: $match_len\n"; if($match2) { # If a whole lot of text was removed, log a warning if($match_len > (500 + length($image))) { botwarnlog(FixupLinks("*Long caption of $match_len bytes replaced in [[$page]]\n")); if($match_len > (1000 + length($image))) { Pearle::myLog(2, "Unusually long caption of $match_len found in [[$page]] ($match2 matches).\n"); print $text, "\n"; # exit; return 0; } } if($match_len < (2 + length($image))) { Pearle::myLog(0, "Short replacement of $match_len bytes (min " . (length($image) + 2) . ") in [[$page]] ($match2 matches). Exiting.\n"); Pearle::myLog(0, "Text:\n$text\n"); exit; } # If many matches, log a warning if($match2 > 2) { Pearle::myLog(3, "More than one match ($match2) in page [[$page]]\n"); # botwarnlog(FixupLinks("*More than one match ($match2) in page [[$page]]\n")); } if($match2 > 100) { Pearle::myLog(1, "Too many matches ($match2) in page [[$page]]. Skipping.\n"); botwarnlog("*Too many matches ($match2) in page [[$page]]. Skipping.\n"); return 0; } } # Put the text back and get it again in order to fold any comments resulting from removing non-gallery images. # This is because gallery image matching will also match commented images. $wikipage->setEditableText($text); $text = $wikipage->getEditableText(); if($text =~ /<gallery/i) { Pearle::myLog(3, "*Possible image gallery in page [[$page]]\n"); if($text =~ s/$regex3g/<!-- $removal_prefix $1 -->/g) { # if($match2 != 0) # { # botwarnlog("*Both a gallery and a non-gallery in [[$page]]\n"); # } $match2 += 1; } } if($match2 > 0) { if($text =~ /\[\[(?: |)<!--/) { Pearle::myLog(2, "Possible multiline image in page [[$page]]\n"); botwarnlog(FixupLinks("*Possible multiline image in page [[$page]]\n")); } } # Improved infobox removal my $infobox_regex = "([-A-Za-z0-9_]+[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*=)[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*" . "[ _]*" . MakeWikiRegex($raw_image) . "[ _]*"; my $infobox_regex_full = "([-A-Za-z0-9_]+[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*=)[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*" . '[Ii]mage[ _]*:[ _]*' . MakeWikiRegex($raw_image); if($text =~ /$infobox_regex/) { Pearle::myLog(3, "Matched on infobox regex: $infobox_regex\n"); Pearle::myLog(3, "Infobox parameter: $1\n"); if($& =~ /puic/) { botwarnlog(FixupLinks("*PUIC in page [[$page]]\n")); } else { my $sub = $1; $text =~ s/$infobox_regex/$sub/g; $match2 += 1; } } if($text =~ /$infobox_regex_full/) { Pearle::myLog(3, "Matched on infobox regex: $infobox_regex_full\n"); Pearle::myLog(3, "Infobox parameter: $1\n"); if($& =~ /puic/) { botwarnlog(FixupLinks("*PUIC in page [[$page]]\n")); } else { my $sub = $1; $text =~ s/$infobox_regex_full/$sub/g; $match2 += 1; } } if($match2) # No need to null-edit articles anymore { if($test_only) { notelog("Test removal from page succeeded\n"); } else { # Submit the changes $wikipage->setEditableText($text); eval { Pearle::postPage($wikipage, $removal_comment, 0); }; if($@) { if($@ =~ /^924 Spam filter: (.*)$/) { botwarnlog("*Spam filter on page [[$page]], url <nowiki>$1\n"); $match2 = 0; # We weren't able to remove it } elsif($@ =~ /^922/) { # Edit conflict. Try editing the page again. botwarnlog("*Edit conflict on page [[$page]]\n"); goto tryagain; } else { die; } } } } return ($match2) } # Returns 1 if the user has been notified, or 0 if they haven't sub IsNotified { my $uploader = shift; my $image_regex = shift; my $image_name = shift; my $notes_ref = shift; my $donts_ref = shift; # Check notification list if($notes_ref->{"$uploader,$image_name"}) { Pearle::myLog(3, "Already notified for this image\n"); return 1; } if($donts_ref->{$uploader}) { Pearle::myLog(3, "On exception list: $uploader\n"); return 1; } # # Check uploader's talkpage my $page_data = Pearle::APIQuery(titles => "User talk:$uploader", prop => 'links', plnamespace => 6); if($page_data =~ /$image_regex/) { Pearle::myLog(3, "Has a link from userpage\n"); return 1; } # my $wikipage = Pearle::getPage("User talk:$uploader"); # my $text = $wikipage->getWikiText(); # if($text =~ /$image_regex/) # { # Pearle::myLog(3, "Already notified by someone else\n"); # $donts_ref->{"$uploader,$image_name"} = 1; # return 1; # } # else # { # Pearle::myLog(3, "Not already notified\n"); # return $wikipage; # } return 0; } sub isDated { my $image_text = shift; if($image_text =~ /\((\d\d?) (\w*) (\d\d\d\d)\)/) # Dated template { myLog(4, "Dated tag $1 $2 $3\n"); return 1; } # as of 6 October 2006"> elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category { myLog(4, "Template borked; category $1 $2 $3\n"); return 1; } elsif($image_text =~ /{{{day}}} {{{month}}} \d\d\d\d/ or $image_text =~ /\( 2006\)/) # Generic template { myLog(4, "Generic tag\n"); return 0; } else { myLog(4, "No tag match\n"); return 0; } } # Return the tag date if there is one, the upload date if not # Returns in (day, month, year) format sub getDate { my $image_text = shift; if($image_text =~ /\((\d\d?) (\w*) (\d\d\d\d)\)/) { myLog(4, "Template date $1-$2-$3\n"); return ($1, $2, $3); } elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category { myLog(4, "Category date $1-$2-$3\n"); return ($1, $2, $3); } elsif($image_text =~ />\d\d?:\d\d, (\d\d?) (\w*) (\d\d\d\d)</) { myLog(4, "Upload date $1-$2-$3\n"); # return ($1, $2, $3); # For now, be conservative: my ($year, $month, $day) = Today(); return ($day, Month_to_Text($month), $year); } else { myLog(4, "No date\n"); return (1, "January", 2007); } } # Return a list of upload dates sub getUploadDates { my @dates; my $image_text = shift; while($image_text =~ />\d\d?:\d\d, (\d\d?) (\w*) (\d\d\d\d)</g) { push @dates, [$1, $2, $3]; } return @dates; } sub getLastEditDate { my ($day, $month, $year); my $image = shift; my @history = Pearle::parseHistory($image); (undef, $day, $month, $year) = @{$history[0]}; return ($day, $month, $year); } # Find the most recent non-vandal, non-revert uploader sub GetImageUploader { my $image_data = shift; my ($uploader, $sha1, $comment); my @uploaders; my $uploader_data; my $i = 0; my $count = 0; my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data); Pearle::myLog(4, Dumper($parsed_xml)); if(exists($parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii}) and defined($parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii})) { if(ref($parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii}) eq 'ARRAY') { @uploaders = @{$parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii}}; } else { return $parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii}->{user}; } } else { return undef; } $uploader = $uploaders[0]->{user}; $sha1 = $uploaders[0]->{sha1}; $comment = $uploaders[0]->{comment} || ""; my $done = 0; while(!$done) { if($comment =~ /^Reverted/) { Pearle::myLog(4, "Revert found\n"); $i += 1; while($uploaders[$i]->{sha1} ne $sha1) { $i = $i + 1; } } elsif($comment =~ /optimi(z|s)ed|adjust|tweak|scale|crop|change|resize|remove/i) { Pearle::myLog(4, "Tweak found\n"); $i = $i + 1; } elsif(!defined($uploader)) { Pearle::myLog(4, "Something went wrong with finding the uploader\n"); $done = 1; } elsif($count > 500) { Pearle::myLog(4, "Took too long finding the uploader\n"); $uploader = undef; $done = 1; } else { $done = 1; } $uploader = $uploaders[$i]->{user}; $sha1 = $uploaders[$i]->{sha1}; $comment = $uploaders[$i]->{comment} || ""; $count = $count + 1; } if(defined($uploader)) { Pearle::myLog(4, "Uploader: $uploader\n"); return $uploader; } else { return undef; } } sub loadNotificationList { my $file = shift; my %notelist; my $i = 0; notelog("File: $file\n"); open INFILE, "<", $file; while(<INFILE>) { $_ =~ s/\s*#.*$//g; chomp; $notelist{$_} = 1; $i++; } close INFILE; notelog("$i notifications loaded\n"); return %notelist; } sub saveNotificationList { return if($test_only); my $file = shift; my %notelist = @_; my $key; open OUTFILE, ">", $file; foreach $key (keys(%notelist)) { print OUTFILE "$key\n"; } close OUTFILE; } 1; </nowiki>