User:FairuseBot/libBot.pm: Difference between revisions
Appearance
Content deleted Content added
New version |
Updated |
||
Line 16: | Line 16: | ||
our @ISA = qw(Exporter); |
our @ISA = qw(Exporter); |
||
our @EXPORT = qw(config usernotify wikilog botwarnlog notelog LoadInfoboxPatterns FixupLinks MakeWikiRegex DoIHaveMessages GetPageCategories GetPageTemplates |
our @EXPORT = qw(config usernotify wikilog botwarnlog notelog LoadInfoboxPatterns FixupLinks MakeWikiRegex DoIHaveMessages GetPageCategories GetPageTemplates GetLinksOnPage GetPageText GetPageList GetFullPageList GetImageNames SaveImage UpdateLink RemoveImageFromPage ReplaceImage IsNotified IsPageNotified isDated getDate getUploadDates getLastEditDate GetImageUploader loadNotificationList saveNotificationList usesTemplate DoesPageExist); |
||
our $VERSION = 1.00; |
our $VERSION = 1.00; |
||
Line 69: | Line 69: | ||
sub wikilog |
sub wikilog |
||
{ |
{ |
||
my($target, $text, $ |
my($target, $text, $token, $summary); |
||
$target = $_[0]; |
$target = $_[0]; |
||
$text = $_[1]; |
|||
$summary = $_[2] || "Logging note"; |
$summary = $_[2] || "Logging note"; |
||
chomp($text); |
|||
$text = "\n$text" if($text !~ /^\n/); # The edit API eats trailing newlines, so prepend a newline if the message doesn't already have one. |
|||
eval |
eval |
||
{ |
{ |
||
$ |
$token = Pearle::getToken($target); |
||
}; |
}; |
||
if($@) |
if($@) |
||
Line 82: | Line 86: | ||
{ |
{ |
||
Pearle::myLog(1, "Failed to notify: Protected page $target\n"); |
Pearle::myLog(1, "Failed to notify: Protected page $target\n"); |
||
return; |
|||
} |
} |
||
else |
else |
||
Line 95: | Line 100: | ||
} |
} |
||
Pearle::appendToPage($target, $token, $text, $summary, 0); |
|||
if($wikipage->getWikiText() =~ /^#redirect/i) |
|||
{ |
|||
botwarnlog("*Log target [[:$target]] is a redirect\n"); |
|||
return; |
|||
} |
|||
$text = $wikipage->getEditableText(); |
|||
$text .= $_[1]; |
|||
$wikipage->setEditableText($text); |
|||
Pearle::postPage($wikipage, $summary, 0); |
|||
print STDERR $_[1]; |
|||
} |
} |
||
Line 131: | Line 125: | ||
{ |
{ |
||
my $link = shift; |
my $link = shift; |
||
$link =~ s/\[\[(Category|Image)/[[:$1/g; |
$link =~ s/\[\[(Category|Image|File)/[[:$1/g; |
||
return $link; |
return $link; |
||
} |
} |
||
Line 141: | Line 135: | ||
my @chars = split //, $string; |
my @chars = split //, $string; |
||
my $result = ''; |
my $result = ''; |
||
return undef if(!defined($string)); |
|||
foreach my $char (@chars) |
foreach my $char (@chars) |
||
Line 146: | Line 142: | ||
# Escape metacharacters, and add percent-encoding for certain characters |
# Escape metacharacters, and add percent-encoding for certain characters |
||
if($char eq '\\') {$result .= '\\\\';} |
if($char eq '\\') {$result .= '\\\\';} |
||
elsif($char eq '/') {$result .= '\/';} |
elsif($char eq '/') {$result .= '(?i:\/|%2F)';} |
||
elsif($char eq '.') {$result .= '\.';} |
elsif($char eq '.') {$result .= '\.';} |
||
elsif($char eq '(') {$result .= '(?:\(|%28)';} |
elsif($char eq '(') {$result .= '(?i:\(|%28)';} |
||
elsif($char eq ')') {$result .= '(?:\)|%29)';} |
elsif($char eq ')') {$result .= '(?i:\)|%29)';} |
||
elsif($char eq '[') {$result .= '\[';} |
elsif($char eq '[') {$result .= '\[';} |
||
elsif($char eq ']') {$result .= '\]';} |
elsif($char eq ']') {$result .= '\]';} |
||
elsif($char eq '+') {$result .= '\+';} |
elsif($char eq '+') {$result .= '\+';} |
||
elsif($char eq '*') {$result .= '\*';} |
elsif($char eq '*') {$result .= '\*';} |
||
elsif($char eq '?') {$result .= '(?:\?|%3F)';} |
elsif($char eq '?') {$result .= '(?i:\?|%3F)';} |
||
elsif($char eq '^') {$result .= '\^';} |
elsif($char eq '^') {$result .= '\^';} |
||
elsif($char eq '$') {$result .= '\$';} |
elsif($char eq '$') {$result .= '\$';} |
||
elsif($char eq '&') {$result .= '(?:&|%26)';} |
elsif($char eq '&') {$result .= '(?i:&|%26)';} |
||
elsif($char eq '!') {$result .= '(?:!|%21)';} |
elsif($char eq '!') {$result .= '(?i:!|%21)';} |
||
elsif($char eq '~') {$result .= '(?:~|%7E)';} |
elsif($char eq '~') {$result .= '(?i:~|%7E)';} |
||
elsif($char eq "'") {$result .= "(?:'|%27)";} |
elsif($char eq "'") {$result .= "(?i:'|%27)";} |
||
elsif($char eq '"') {$result .= '(?:"|%22)';} |
elsif($char eq '"') {$result .= '(?i:"|%22)';} |
||
elsif($char eq ',') {$result .= '(?:,|%2C)';} |
elsif($char eq ',') {$result .= '(?i:,|%2C)';} |
||
else {$result .= $char;} |
else {$result .= $char;} |
||
} |
} |
||
Line 258: | Line 254: | ||
} |
} |
||
sub |
sub GetLinksOnPage |
||
{ |
{ |
||
my $xml = shift; |
my $xml = shift; |
||
Line 297: | Line 293: | ||
Pearle::myLog(4, Dumper($parsed_xml)); |
Pearle::myLog(4, Dumper($parsed_xml)); |
||
if(exists($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}) and defined($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev})) |
if(exists($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content}) and defined($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content})) |
||
{ |
{ |
||
if(ref($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev})) |
if(ref($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content})) |
||
{ |
{ |
||
# The API/XML parser interact to produce a HASH ref if the revision is empty |
# The API/XML parser interact to produce a HASH ref if the revision is empty |
||
Line 306: | Line 302: | ||
else |
else |
||
{ |
{ |
||
$text = $parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}; |
$text = $parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content}; |
||
} |
} |
||
} |
} |
||
Line 348: | Line 344: | ||
if(defined($xml)) |
if(defined($xml)) |
||
{ |
{ |
||
my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml); |
my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml, ForceArray => ['iu']); |
||
my $image = $parsed_xml->{query}->{pages}->{page}->{title}; |
my $image = $parsed_xml->{query}->{pages}->{page}->{title}; |
||
Line 355: | Line 351: | ||
if(exists($parsed_xml->{query}->{imageusage}->{iu}) and defined($parsed_xml->{query}->{imageusage}->{iu})) |
if(exists($parsed_xml->{query}->{imageusage}->{iu}) and defined($parsed_xml->{query}->{imageusage}->{iu})) |
||
{ |
{ |
||
my @bad_pages = grep {$_->{ns} == 10 or $_->{ns} == 12} @{$parsed_xml->{query}->{imageusage}->{iu}}; |
|||
my @good_pages = grep {$_->{ns} != 10 and $_->{ns} != 12} @{$parsed_xml->{query}->{imageusage}->{iu}}; |
|||
@pages = map {$_->{title}} @good_pages; |
|||
if(scalar(@bad_pages) > 0 and defined($image)) # If "image" is undefined, we're probably doing a pure usage check, rather than one in preparation for removal |
|||
{ |
{ |
||
my $notice; |
|||
my @bad_pages = grep {$_->{ns} == 10 or $_->{ns} == 12} @{$parsed_xml->{query}->{imageusage}->{iu}}; |
|||
foreach my $page (@bad_pages) |
|||
my @good_pages = grep {$_->{ns} != 10 and $_->{ns} != 12} @{$parsed_xml->{query}->{imageusage}->{iu}}; |
|||
@pages = map {$_->{title}} @good_pages; |
|||
if(scalar(@bad_pages) > 0 and defined($image)) # If "image" is undefined, we're probably doing a pure usage check, rather than one in preparation for removal |
|||
{ |
{ |
||
$notice .= "*Found image [[:$image]] in [[$page->{title}]]\n"; |
|||
my $notice; |
|||
foreach my $page (@bad_pages) |
|||
{ |
|||
$notice .= "*Found image [[:$image]] in [[$page->{title}]]\n"; |
|||
} |
|||
botwarnlog($notice); |
|||
} |
|||
} |
|||
else |
|||
{ |
|||
if($parsed_xml->{query}->{imageusage}->{iu}->{ns} != 10 and $parsed_xml->{query}->{imageusage}->{iu}->{ns} != 12) |
|||
{ |
|||
@pages = $parsed_xml->{query}->{imageusage}->{iu}->{title}; |
|||
} |
|||
else |
|||
{ |
|||
if(defined($image)) |
|||
{ |
|||
botwarnlog("*Found image [[:$image]] in [[$parsed_xml->{query}->{imageusage}->{iu}->{title}]]\n"); |
|||
} |
|||
} |
} |
||
botwarnlog($notice); |
|||
} |
} |
||
} |
} |
||
Line 395: | Line 374: | ||
sub GetFullPageList |
sub GetFullPageList |
||
{ |
{ |
||
my $ |
my $xml = shift; |
||
my $image; |
|||
my @pages = (); |
my @pages = (); |
||
my $xml = Pearle::APIQuery(list => 'imageusage', iutitle => $image); |
|||
if(defined($xml)) |
if(defined($xml)) |
||
{ |
{ |
||
my $parsed_xml = Pearle::getXMLParser()->XMLin($xml); |
my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml, ForceArray => ['iu']); |
||
my $image = $parsed_xml->{query}->{pages}->{page}->{title}; |
|||
Pearle::myLog(4, Dumper($parsed_xml)); |
Pearle::myLog(4, Dumper($parsed_xml)); |
||
Line 407: | Line 387: | ||
if(exists($parsed_xml->{query}->{imageusage}->{iu}) and defined($parsed_xml->{query}->{imageusage}->{iu})) |
if(exists($parsed_xml->{query}->{imageusage}->{iu}) and defined($parsed_xml->{query}->{imageusage}->{iu})) |
||
{ |
{ |
||
@pages = map {$_->{title}} @{$parsed_xml->{query}->{imageusage}->{iu}}; |
|||
if(ref() eq 'ARRAY') |
|||
{ |
|||
@pages = map {$_->{title}} @{$parsed_xml->{query}->{imageusage}->{iu}}; |
|||
} |
|||
else |
|||
{ |
|||
@pages = $parsed_xml->{query}->{imageusage}->{iu}->{title}; |
|||
} |
|||
} |
} |
||
} |
} |
||
Line 420: | Line 393: | ||
return @pages; |
return @pages; |
||
} |
} |
||
# Input: XML from the API, generated with list => 'backlinks'. blfilterredir => 'redirects' is recommended but not mandatory. |
|||
# Either as text or as a parsed tree |
|||
# |
|||
# Returns: A list of redirects to the image |
|||
# |
|||
# Side effects: None |
|||
sub GetImageNames |
|||
{ |
|||
my $xml = shift; |
|||
my @names; |
|||
if(defined($xml)) |
|||
{ |
|||
my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml, ForceArray => ['bl']); |
|||
if(exists($parsed_xml->{query}->{backlinks}->{bl}) and defined($parsed_xml->{query}->{backlinks}->{bl})) |
|||
{ |
|||
@names = map {$_->{title}} grep( {defined($_->{redirect})} @{$parsed_xml->{query}->{backlinks}->{bl}}); |
|||
} |
|||
} |
|||
return @names; |
|||
} |
|||
sub UpdateLink |
sub UpdateLink |
||
Line 478: | Line 474: | ||
$wikipage->canonicalizeLinks(); |
$wikipage->canonicalizeLinks(); |
||
$text = $wikipage->getEditableText(); |
$text = $wikipage->getEditableText(); |
||
if(!defined($text)) |
if(!defined($text)) |
||
{ |
{ |
||
Line 505: | Line 501: | ||
# Remove the image |
# Remove the image |
||
my $regex3 = "(\x01${image_regex}[^\x01]*?(\x01[^\x02]*?\x02[^\x01]*?|)+\x02[ \\t]*)"; # Regex to match images |
my $regex3 = "([ \\t]*\x01${image_regex}[^\x01]*?(\x01[^\x02]*?\x02[^\x01]*?|)+\x02[ \\t]*)"; # Regex to match images |
||
#my $regex3 = "( |
#my $regex3 = "( |
||
# [ \\t]* # Any leading whitespace |
|||
# \x01 # Open double-bracket for the image |
# \x01 # Open double-bracket for the image |
||
# ${image_regex} # The image itself |
# ${image_regex} # The image itself |
||
Line 522: | Line 519: | ||
my $regex3ex = "\\w[ \\t]*${regex3}[ \\t]*\\w"; # Regex to try to spot inline images |
my $regex3ex = "\\w[ \\t]*${regex3}[ \\t]*\\w"; # Regex to try to spot inline images |
||
my $regex3g = "(${image_regex}.*)"; # Regex to match gallery images |
my $regex3g = "(${image_regex}.*)"; # Regex to match gallery images |
||
my ($raw_image) = $image =~ /Image:(.*)/; |
my ($raw_image) = $image =~ /(?:Image|File):(.*)/; |
||
my $regex4m = "\x01[ _]*[Mm]edia[ _]*:[ _]*" . MakeWikiRegex($raw_image) . "[ _]*\\|([^]]*)\x02"; # Regex to match inline Media: links |
|||
Pearle::myLog(3, "Regex 3: $regex3\n"); |
Pearle::myLog(3, "Regex 3: $regex3\n"); |
||
Line 530: | Line 525: | ||
notelog("Regex 3 gallery: $regex3g\n"); |
notelog("Regex 3 gallery: $regex3g\n"); |
||
Pearle::myLog(3, "Raw regex: $raw_image\n"); |
Pearle::myLog(3, "Raw regex: $raw_image\n"); |
||
notelog("Regex 4 Media: $regex4m\n"); |
|||
if($text =~ /$regex3ex/) |
if($text =~ /$regex3ex/) |
||
Line 538: | Line 532: | ||
return 0; # Can't do gallery matching because that also matches regular images, and odds are, we don't have an infobox |
return 0; # Can't do gallery matching because that also matches regular images, and odds are, we don't have an infobox |
||
} |
} |
||
# Not worth checking for anymore |
|||
# if($text =~ /-->/) # With the new editing markup, any close-comment means that somebody fucked up their wikimarkup |
|||
# { |
|||
# Pearle::myLog(3, "Fractional comment in page [[$page]]\n"); |
|||
# botwarnlog(FixupLinks("*Fractional comment in page [[$page]]\n")); |
|||
# } |
|||
$text =~ /$regex3/; |
$text =~ /$regex3/; |
||
$match_len = length($1); |
$match_len = length($1); |
||
my @matches = $text =~ /$regex3/g; |
|||
if(grep {$_ =~ /[\x{F0000}-\x{FFFFF}]/} @matches) # If any images have comments in their captions, we can't remove them |
|||
{ |
|||
botwarnlog(FixupLinks("*Comment in image in [[$page]]\n")); |
|||
goto skipregular; |
|||
} |
|||
if(defined($removal_prefix)) |
if(defined($removal_prefix)) |
||
{ |
{ |
||
Line 561: | Line 555: | ||
if($match2) |
if($match2) |
||
{ |
{ |
||
# If a whole lot of text was removed, log a warning |
|||
if($match_len > (500 + length($image))) |
|||
{ |
|||
botwarnlog(FixupLinks("*Long caption of $match_len bytes replaced in [[$page]]\n")); |
|||
if($match_len > (1000 + length($image))) |
|||
{ |
|||
Pearle::myLog(2, "Unusually long caption of $match_len found in [[$page]] ($match2 matches).\n"); |
|||
print $text, "\n"; |
|||
# exit; |
|||
return 0; |
|||
} |
|||
} |
|||
if($match_len < (2 + length($image))) |
if($match_len < (2 + length($image))) |
||
{ |
{ |
||
Line 585: | Line 567: | ||
{ |
{ |
||
Pearle::myLog(3, "More than one match ($match2) in page [[$page]]\n"); |
Pearle::myLog(3, "More than one match ($match2) in page [[$page]]\n"); |
||
# botwarnlog(FixupLinks("*More than one match ($match2) in page [[$page]]\n")); |
|||
} |
} |
||
if($match2 > 100) |
if($match2 > 100) |
||
Line 595: | Line 576: | ||
} |
} |
||
skipregular: |
|||
# Put the text back and get it again in order to fold any comments resulting from removing non-gallery images. |
# Put the text back and get it again in order to fold any comments resulting from removing non-gallery images. |
||
# This is because gallery image matching will also match commented images. |
# This is because gallery image matching will also match commented images. |
||
Line 630: | Line 612: | ||
# Improved infobox removal |
# Improved infobox removal |
||
my $infobox_regex = "([-A-Za-z0-9_]+[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*=)[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; |
my $infobox_regex = "([-A-Za-z0-9_]+[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*=)[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; |
||
my $infobox_regex_full = "([-A-Za-z0-9_]+[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*=)[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . '[Ii] |
my $infobox_regex_full = "([-A-Za-z0-9_]+[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*=)[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . '(?:[Ii][Mm][Aa][Gg][Ee]|[Ff][Ii][Ll][Ee])[ _]*:[ _]*' . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; |
||
if($text =~ /$infobox_regex/) |
if($text =~ /$infobox_regex/) |
||
{ |
{ |
||
Line 658: | Line 640: | ||
{ |
{ |
||
my $sub = $1; |
my $sub = $1; |
||
my $match_regex = MakeWikiRegex($sub) . "[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . '[Ii] |
my $match_regex = MakeWikiRegex($sub) . "[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . '(?:[Ii][Mm][Aa][Gg][Ee]|[Ff][Ii][Ll][Ee])[ _]*:[ _]*' . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; |
||
$text =~ s/$match_regex/$sub/g; |
$text =~ s/$match_regex/$sub/g; |
||
$match2 += 1; |
$match2 += 1; |
||
Line 677: | Line 659: | ||
{ |
{ |
||
Pearle::postPage($wikipage, $removal_comment, 0); |
Pearle::postPage($wikipage, $removal_comment, 0); |
||
}; |
|||
if($@) |
|||
{ |
|||
if($@ =~ /^924 Spam filter: (.*)$/) |
|||
{ |
|||
botwarnlog("*Spam filter on page [[$page]], url <nowiki>$1</nowiki>\n"); |
|||
$match2 = 0; # We weren't able to remove it |
|||
} |
|||
elsif($@ =~ /^922/) |
|||
{ |
|||
# Edit conflict. Try editing the page again. |
|||
botwarnlog("*Edit conflict on page [[$page]]\n"); |
|||
goto tryagain; |
|||
} |
|||
else |
|||
{ |
|||
die; |
|||
} |
|||
} |
|||
} |
|||
} |
|||
return ($match2) |
|||
} |
|||
sub ReplaceImage |
|||
{ |
|||
my $image = shift; |
|||
my $page = shift; |
|||
my $image_regex = shift; |
|||
my $new_image = shift; |
|||
my $reason = shift; |
|||
my $wikipage; |
|||
my $text; |
|||
my ($match1, $match2); |
|||
my $old_length; |
|||
my $new_length; |
|||
my $change_len; |
|||
my $match_len; |
|||
tryagain: |
|||
# Fetch an article page |
|||
$wikipage = Pearle::getPage($page); |
|||
$wikipage->canonicalizeLinks(); |
|||
$text = $wikipage->getEditableText(); |
|||
if(!defined($text)) |
|||
{ |
|||
Pearle::myLog(1, "Error: Bad edit page [[$page]]\n"); |
|||
botwarnlog(FixupLinks("*Error: Bad edit page [[$page]]\n")); |
|||
sleep(300); |
|||
return 0; |
|||
} |
|||
if($text =~ /^\s*$/) |
|||
{ |
|||
# Might be protected instead of empty |
|||
Pearle::myLog(1, "Error: Empty or protected page [[$page]]\n"); |
|||
botwarnlog(FixupLinks("*Error: Empty or protected page [[$page]]\n")); |
|||
sleep(300); |
|||
return 0; |
|||
} |
|||
if($text =~ /^#redirect/i) |
|||
{ |
|||
Pearle::myLog(1, "Redirect found for page [[$page]] (image [[:$image]])\n"); |
|||
botwarnlog(FixupLinks("*Redirect found for page [[$page]] (image [[:$image]])\n")); |
|||
print $text; |
|||
return 0; |
|||
} |
|||
# Remove the image |
|||
my $regex3 = "([ \\t]*\x01)${image_regex}([^\x01]*?(\x01[^\x02]*?\x02[^\x01]*?|)+\x02[ \\t]*)"; # Regex to match images |
|||
#my $regex3 = "( |
|||
# [ \\t]* # Any leading whitespace |
|||
# \x01 # Open double-bracket for the image |
|||
# ${image_regex} # The image itself |
|||
# [^\x01]*? # Anything up to the first link in the caption, or a closing double bracket (minimal match) |
|||
# (\x01 # Open double-bracket for a link in the caption |
|||
# [^\x02]*? # Anything but a closing double-bracket |
|||
# \x02 # The closing double-bracket for the link |
|||
# [^\x01]*?|) # Any non-link text, or nothing |
|||
# + # Matches one or more times |
|||
# \x02 # The closing double-bracket for the image |
|||
# [ \\t]*) # Any trailing whitespace |
|||
# "; |
|||
my $regex3g = "${image_regex}(.*)"; # Regex to match gallery images |
|||
my ($raw_image) = $image =~ /(?:Image|File):(.*)/; |
|||
my $regex4m = "(\x01[ _]*[Mm][Ee][Dd][Ii][Aa][ _]*:[ _]*)" . MakeWikiRegex($raw_image) . "([ _]*\\|([^]]*)\x02)"; # Regex to match inline Media: links |
|||
Pearle::myLog(3, "Regex 3: $regex3\n"); |
|||
notelog("Regex 3 gallery: $regex3g\n"); |
|||
Pearle::myLog(3, "Raw regex: $raw_image\n"); |
|||
notelog("Regex 4 Media: $regex4m\n"); |
|||
$old_length = length($text); |
|||
$match2 = $text =~ s/$regex3/$1$new_image$2/g; |
|||
$new_length = length($text); |
|||
print "Num: $match2 Len: $match_len\n"; |
|||
if($match2) |
|||
{ |
|||
# If the length change isn't right, log a warning and return without saving |
|||
if($new_length != $old_length - ($match2 * (length($image) - length($new_image)))) |
|||
{ |
|||
botwarnlog(FixupLinks("*Length mismatch on [[$page]] replacing [[$image]] with [[$new_image]]\n")); |
|||
Pearle::myLog(1, "Length mismatch on $page replacing [[$image]] with [{$new_image]]\n"); |
|||
return 0; |
|||
} |
|||
# If many matches, log a warning |
|||
if($match2 > 2) |
|||
{ |
|||
Pearle::myLog(3, "More than one match ($match2) in page [[$page]]\n"); |
|||
} |
|||
if($match2 > 100) |
|||
{ |
|||
Pearle::myLog(1, "Too many matches ($match2) in page [[$page]]. Skipping.\n"); |
|||
botwarnlog("*Too many matches ($match2) in page [[$page]]. Skipping.\n"); |
|||
return 0; |
|||
} |
|||
} |
|||
if($text =~ /<gallery/i) |
|||
{ |
|||
Pearle::myLog(3, "*Possible image gallery in page [[$page]]\n"); |
|||
if($new_image =~ /$regex3g/) |
|||
{ |
|||
Pearle::myLog(2, "New image name $new_image is a substring of old image name $image\n"); |
|||
botwarnlog("*New image [[$new_image]] is a substring of [[$image]] in page [[$page]]\n"); |
|||
} |
|||
else |
|||
{ |
|||
my $gallery_matches = $text =~ s/$regex3g/$new_image$1/g; |
|||
$match2 += $gallery_matches; |
|||
} |
|||
} |
|||
# Improved infobox replacement |
|||
my $infobox_regex = "([-A-Za-z0-9_]+[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*=)[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; |
|||
my $infobox_regex_full = "([-A-Za-z0-9_]+[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*=)[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . '(?:[Ii][Mm][Aa][Gg][Ee]|[Ff][Ii][Ll][Ee])[[ _]*:[ _]*' . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; |
|||
if($text =~ /$infobox_regex/) |
|||
{ |
|||
Pearle::myLog(3, "Matched on infobox regex: $infobox_regex\n"); |
|||
Pearle::myLog(3, "Infobox parameter: $1\n"); |
|||
if($& =~ /puic/) |
|||
{ |
|||
botwarnlog(FixupLinks("*PUIC in page [[$page]]\n")); |
|||
} |
|||
else |
|||
{ |
|||
my $sub = $1; |
|||
my $match_regex = MakeWikiRegex($sub) . "[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; |
|||
my $non_prefixed_new_image = $new_image; |
|||
$non_prefixed_new_image =~ s/^(Image|File)://; |
|||
$text =~ s/$match_regex/${sub}${non_prefixed_new_image}/g; |
|||
$match2 += 1; |
|||
} |
|||
} |
|||
if($text =~ /$infobox_regex_full/) |
|||
{ |
|||
Pearle::myLog(3, "Matched on infobox regex: $infobox_regex_full\n"); |
|||
Pearle::myLog(3, "Infobox parameter: $1\n"); |
|||
if($& =~ /puic/) |
|||
{ |
|||
botwarnlog(FixupLinks("*PUIC in page [[$page]]\n")); |
|||
} |
|||
else |
|||
{ |
|||
my $sub = $1; |
|||
my $match_regex = MakeWikiRegex($sub) . "[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . '(?:[Ii][Mm][Aa][Gg][Ee]|[Ff][Ii][Ll][Ee])[ _]*:[ _]*' . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; |
|||
$text =~ s/$match_regex/${sub}$new_image/g; |
|||
$match2 += 1; |
|||
} |
|||
} |
|||
if($match2) # No need to null-edit articles anymore |
|||
{ |
|||
if($test_only) |
|||
{ |
|||
notelog("Test removal from page succeeded\n"); |
|||
} |
|||
else |
|||
{ |
|||
# Submit the changes |
|||
$wikipage->setEditableText($text); |
|||
eval |
|||
{ |
|||
Pearle::postPage($wikipage, $reason, 0); |
|||
}; |
}; |
||
if($@) |
if($@) |
||
Line 811: | Line 988: | ||
elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category |
elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category |
||
{ |
{ |
||
Pearle::myLog(4, "Category date $1-$2-$3\n"); |
Pearle::myLog(4, "Category date 'as of' $1-$2-$3\n"); |
||
return ($1, $2, $3); |
|||
} |
|||
elsif($image_text =~ /from (\d\d?) (\w*) (\d\d\d\d)/) # Alternate category naming |
|||
{ |
|||
Pearle::myLog(4, "Category date 'from' $1-$2-$3\n"); |
|||
return ($1, $2, $3); |
return ($1, $2, $3); |
||
} |
} |
||
Line 890: | Line 1,072: | ||
} |
} |
||
} |
} |
||
elsif($comment =~ /optimi(z|s)ed|adjust|tweak|scale|crop|change|resize|remove| |
elsif($comment =~ /optimi(z|s)ed|adjust|tweak|scale|crop|change|resize|remove|reduc(e|ing)/i) |
||
{ |
{ |
||
Pearle::myLog(4, "Tweak found\n"); |
Pearle::myLog(4, "Tweak found\n"); |
||
Line 1,001: | Line 1,183: | ||
} |
} |
||
return $result; |
return $result; |
||
} |
|||
sub DoesPageExist |
|||
{ |
|||
my $page = shift; |
|||
my $xml = Pearle::APIQuery(titles => [$page], prop => ['info']); |
|||
# TODO: Handle API query errors ('undef' return values) |
|||
if($xml =~ /missing=""/) |
|||
{ |
|||
return 0; |
|||
} |
|||
return 1; |
|||
} |
} |
||
Latest revision as of 11:54, 9 November 2012
#!/usr/bin/perl # libBot: A Perl module of useful routines for running a bot package libBot; use strict; use warnings; use Pearle; use Data::Dumper; use Array::Utils; require Exporter; our @ISA = qw(Exporter); our @EXPORT = qw(config usernotify wikilog botwarnlog notelog LoadInfoboxPatterns FixupLinks MakeWikiRegex DoIHaveMessages GetPageCategories GetPageTemplates GetLinksOnPage GetPageText GetPageList GetFullPageList GetImageNames SaveImage UpdateLink RemoveImageFromPage ReplaceImage IsNotified IsPageNotified isDated getDate getUploadDates getLastEditDate GetImageUploader loadNotificationList saveNotificationList usesTemplate DoesPageExist); our $VERSION = 1.00; my $test_only = 0; my $username = ""; my @infobox_patterns = (); sub config { my %params = @_; $test_only = $params{test_only} if(defined($params{test_only})); $username = $params{username} if(defined($params{username})); } # Log a warning on a user's talkpage, using an existing edit session sub usernotify { my ($wikipage, $text, $user, $summary); $wikipage = $_[1]; $summary = $_[2]; $summary = "Logging warning message" if(!defined($summary)); if(!$wikipage->isa("Pearle::WikiPage")) { Pearle::myLog(0, "usernotify(): Second parameter is not a WikiPage object\n"); die "usernotify(): Second parameter is not a WikiPage object\n"; } # We've been handed an editing session Pearle::myLog(4, "Warning with existing edit session\n"); if($test_only) { print STDERR $_[0]; return; } if($wikipage->getWikiText() =~ /^#redirect/i) { botwarnlog("*User talk page [[User talk:$user]] is a redirect\n"); return; } $text = $wikipage->getEditableText(); $text .= $_[0]; $wikipage->setEditableText($text); Pearle::postPage($wikipage, $summary, 0); print STDERR $_[0]; } # General-purpose on-Wiki logging routine sub wikilog { my($target, $text, $token, $summary); $target = $_[0]; $text = $_[1]; $summary = $_[2] || "Logging note"; chomp($text); $text = "\n$text" if($text !~ /^\n/); # The edit API eats trailing newlines, so prepend a newline if the message doesn't already have one. eval { $token = Pearle::getToken($target); }; if($@) { if($@ =~ /^925/) { Pearle::myLog(1, "Failed to notify: Protected page $target\n"); return; } else { die; } } if($test_only) { print STDERR $_[1]; return; } Pearle::appendToPage($target, $token, $text, $summary, 0); } # Log a warning on the talk page of the bot sub botwarnlog { my ($page, $text, $summary); $text = $_[0]; $summary = $_[1]; $summary = "Logging warning message" if(!defined($summary)); $page = "User talk:${username}/log"; wikilog($page, $text, $summary); } # Log a notification message to the console sub notelog { print STDERR @_; } # Fix all wikilinks in a string so that they shows as a link, not inline, if it's for a category or image sub FixupLinks { my $link = shift; $link =~ s/\[\[(Category|Image|File)/[[:$1/g; return $link; } # Make a string into a Wikipedia-compatible regex sub MakeWikiRegex { my $string = shift; my @chars = split //, $string; my $result = ''; return undef if(!defined($string)); foreach my $char (@chars) { # Escape metacharacters, and add percent-encoding for certain characters if($char eq '\\') {$result .= '\\\\';} elsif($char eq '/') {$result .= '(?i:\/|%2F)';} elsif($char eq '.') {$result .= '\.';} elsif($char eq '(') {$result .= '(?i:\(|%28)';} elsif($char eq ')') {$result .= '(?i:\)|%29)';} elsif($char eq '[') {$result .= '\[';} elsif($char eq ']') {$result .= '\]';} elsif($char eq '+') {$result .= '\+';} elsif($char eq '*') {$result .= '\*';} elsif($char eq '?') {$result .= '(?i:\?|%3F)';} elsif($char eq '^') {$result .= '\^';} elsif($char eq '$') {$result .= '\$';} elsif($char eq '&') {$result .= '(?i:&|%26)';} elsif($char eq '!') {$result .= '(?i:!|%21)';} elsif($char eq '~') {$result .= '(?i:~|%7E)';} elsif($char eq "'") {$result .= "(?i:'|%27)";} elsif($char eq '"') {$result .= '(?i:"|%22)';} elsif($char eq ',') {$result .= '(?i:,|%2C)';} else {$result .= $char;} } # Process the string to match both with spaces and with underscores $result =~ s/[ _]/[ _]+/g; # Process the string to match both upcase and lowercase first characters if($result =~ /^[A-Za-z]/) { $result =~ s/^(.)/"[$1".lc($1)."]"/e; } return $result; } sub HTMLEncode { my $char = shift; return sprintf("&X%X;", ord($char)); } # Make a string into something that can match most image name formats sub MakeFancyRegex { my $string = shift; my @chars = split //, $string; my $result; foreach my $char (@chars) { if($char eq '\\') { $result .= "(\\\\|%5C|%5c|&x5C;)"; } elsif($char eq '.') { } elsif($char eq '(') { } elsif($char eq ')') { } else { $result .= "($char|" . uri_escape_utf8($char) . "|" . lc(uri_escape_utf8($char)) . "|" . HTMLEncode($char) . "|" . lc(HTMLEncode($char)) . ")"; } } return $result; } # Check for new talk page messages sub DoIHaveMessages { my $xml = shift; my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml); if(exists($parsed_xml->{query}->{userinfo}->{messages}) and defined($parsed_xml->{query}->{userinfo}->{messages})) { return 1; } else { return 0; } } sub GetPageCategories { my $xml = shift; my @pages = (); if(defined($xml)) { my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml); Pearle::myLog(4, Dumper($parsed_xml)); if(exists($parsed_xml->{query}->{pages}->{page}->{categories}->{cl}) and defined($parsed_xml->{query}->{pages}->{page}->{categories}->{cl})) { if(ref($parsed_xml->{query}->{pages}->{page}->{categories}->{cl}) eq 'ARRAY') { my @all_pages = @{$parsed_xml->{query}->{pages}->{page}->{categories}->{cl}}; @pages = map {$_->{title}} @all_pages; } else { @pages = ($parsed_xml->{query}->{pages}->{page}->{categories}->{cl}->{title}); } } } return @pages; } sub GetLinksOnPage { my $xml = shift; my @pages = (); if(defined($xml)) { my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml); Pearle::myLog(4, Dumper($parsed_xml)); if(exists($parsed_xml->{query}->{pages}->{page}->{links}->{pl}) and defined($parsed_xml->{query}->{pages}->{page}->{links}->{pl})) { if(ref($parsed_xml->{query}->{pages}->{page}->{links}->{pl}) eq 'ARRAY') { my @all_pages = @{$parsed_xml->{query}->{pages}->{page}->{links}->{pl}}; @pages = map {$_->{title}} @all_pages; } else { @pages = ($parsed_xml->{query}->{pages}->{page}->{links}->{pl}->{title}); } } } return @pages; } sub GetPageText { my $xml = shift; my $text = undef; if(defined($xml)) { my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml); Pearle::myLog(4, Dumper($parsed_xml)); if(exists($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content}) and defined($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content})) { if(ref($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content})) { # The API/XML parser interact to produce a HASH ref if the revision is empty $text = ""; } else { $text = $parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content}; } } } return $text; } # Input: XML from the API, generated with prop => 'templates' and with only a single title # Either as text or as a parsed tree # # Returns: A list of templates used by the page # # Side effects: None sub GetPageTemplates { my $xml = shift; my @templates; if(defined($xml)) { my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml, ForceArray => ['tl']); if(exists($parsed_xml->{query}->{pages}->{page}->{templates}->{tl}) and defined($parsed_xml->{query}->{pages}->{page}->{templates}->{tl})) { @templates = map {$_->{title}} @{$parsed_xml->{query}->{pages}->{page}->{templates}->{tl}}; } } return @templates; } # Input: XML, either a tree produced by parsing, or XML text # # Returns: A list of pages that this image is used on # # Side effects: For pages in certain namespaces, posts on the bot's log page sub GetPageList { my $xml = shift; my $image; my @pages = (); if(defined($xml)) { my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml, ForceArray => ['iu']); my $image = $parsed_xml->{query}->{pages}->{page}->{title}; Pearle::myLog(4, Dumper($parsed_xml)); if(exists($parsed_xml->{query}->{imageusage}->{iu}) and defined($parsed_xml->{query}->{imageusage}->{iu})) { my @bad_pages = grep {$_->{ns} == 10 or $_->{ns} == 12} @{$parsed_xml->{query}->{imageusage}->{iu}}; my @good_pages = grep {$_->{ns} != 10 and $_->{ns} != 12} @{$parsed_xml->{query}->{imageusage}->{iu}}; @pages = map {$_->{title}} @good_pages; if(scalar(@bad_pages) > 0 and defined($image)) # If "image" is undefined, we're probably doing a pure usage check, rather than one in preparation for removal { my $notice; foreach my $page (@bad_pages) { $notice .= "*Found image [[:$image]] in [[$page->{title}]]\n"; } botwarnlog($notice); } } } return @pages; } # Get all pages. Don't filter for bad namespaces. sub GetFullPageList { my $xml = shift; my $image; my @pages = (); if(defined($xml)) { my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml, ForceArray => ['iu']); my $image = $parsed_xml->{query}->{pages}->{page}->{title}; Pearle::myLog(4, Dumper($parsed_xml)); if(exists($parsed_xml->{query}->{imageusage}->{iu}) and defined($parsed_xml->{query}->{imageusage}->{iu})) { @pages = map {$_->{title}} @{$parsed_xml->{query}->{imageusage}->{iu}}; } } return @pages; } # Input: XML from the API, generated with list => 'backlinks'. blfilterredir => 'redirects' is recommended but not mandatory. # Either as text or as a parsed tree # # Returns: A list of redirects to the image # # Side effects: None sub GetImageNames { my $xml = shift; my @names; if(defined($xml)) { my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml, ForceArray => ['bl']); if(exists($parsed_xml->{query}->{backlinks}->{bl}) and defined($parsed_xml->{query}->{backlinks}->{bl})) { @names = map {$_->{title}} grep( {defined($_->{redirect})} @{$parsed_xml->{query}->{backlinks}->{bl}}); } } return @names; } sub UpdateLink { my $page = shift; my $from = shift; my $to = shift; my $summary = shift || "Updating link to bypass a redirect or disambiguation page"; die "No page to edit" if(!defined($page)); die "No link to change" if(!defined($from)); die "No new link" if(!defined($to)); Pearle::myLog(3, "Updating link from $from to $to for page $page\n"); my $wikipage = Pearle::getPage($page); $wikipage->canonicalizeLinks(); my $text = $wikipage->getEditableText(); my $link_regex = MakeWikiRegex($from); my $matches = $text =~ s/\x01($link_regex)\x02/\x01${to}|${1}\x02/gi; $matches += $text =~ s/\x01$link_regex([#|])/\x01${to}${1}/gi; $matches += $text =~ s/([^=]\s*=\s*)$link_regex(\s*[|\n])/${1}$to${2}/gi; $wikipage->setEditableText($text); print $text; if($matches > 0) { Pearle::postPage( $wikipage, $summary, 0); } else { Pearle::myLog(3, "No update\n"); } return $matches; } sub RemoveImageFromPage { my $image = shift; my $page = shift; my $image_regex = shift; my $removal_prefix = shift; my $removal_comment = shift; my $wikipage; my $text; my ($match1, $match2); my $old_length; my $new_length; my $change_len; my $match_len; tryagain: # Fetch an article page $wikipage = Pearle::getPage($page); $wikipage->canonicalizeLinks(); $text = $wikipage->getEditableText(); if(!defined($text)) { Pearle::myLog(1, "Error: Bad edit page [[$page]]\n"); botwarnlog(FixupLinks("*Error: Bad edit page [[$page]]\n")); sleep(300); return 0; } if($text =~ /^\s*$/) { # Might be protected instead of empty Pearle::myLog(1, "Error: Empty or protected page [[$page]]\n"); botwarnlog(FixupLinks("*Error: Empty or protected page [[$page]]\n")); sleep(300); return 0; } if($text =~ /^#redirect/i) { Pearle::myLog(1, "Redirect found for page [[$page]] (image [[:$image]])\n"); botwarnlog(FixupLinks("*Redirect found for page [[$page]] (image [[:$image]])\n")); print $text; return 0; } # Remove the image my $regex3 = "([ \\t]*\x01${image_regex}[^\x01]*?(\x01[^\x02]*?\x02[^\x01]*?|)+\x02[ \\t]*)"; # Regex to match images #my $regex3 = "( # [ \\t]* # Any leading whitespace # \x01 # Open double-bracket for the image # ${image_regex} # The image itself # [^\x01]*? # Anything up to the first link in the caption, or a closing double bracket (minimal match) # (\x01 # Open double-bracket for a link in the caption # [^\x02]*? # Anything but a closing double-bracket # \x02 # The closing double-bracket for the link # [^\x01]*?|) # Any non-link text, or nothing # + # Matches one or more times # \x02 # The closing double-bracket for the image # [ \\t]*) # Any trailing whitespace # "; my $regex3ex = "\\w[ \\t]*${regex3}[ \\t]*\\w"; # Regex to try to spot inline images my $regex3g = "(${image_regex}.*)"; # Regex to match gallery images my ($raw_image) = $image =~ /(?:Image|File):(.*)/; Pearle::myLog(3, "Regex 3: $regex3\n"); notelog("Regex 3 extended: $regex3ex\n"); notelog("Regex 3 gallery: $regex3g\n"); Pearle::myLog(3, "Raw regex: $raw_image\n"); if($text =~ /$regex3ex/) { Pearle::myLog(1, "Possible inline image in [[$page]]\n"); botwarnlog(FixupLinks("*Possible inline image [[:$image]] in [[$page]]\n")); return 0; # Can't do gallery matching because that also matches regular images, and odds are, we don't have an infobox } $text =~ /$regex3/; $match_len = length($1); my @matches = $text =~ /$regex3/g; if(grep {$_ =~ /[\x{F0000}-\x{FFFFF}]/} @matches) # If any images have comments in their captions, we can't remove them { botwarnlog(FixupLinks("*Comment in image in [[$page]]\n")); goto skipregular; } if(defined($removal_prefix)) { $match2 = $text =~ s/$regex3/<!-- $removal_prefix $1 -->/g; } else { $match2 = $text =~ s/$regex3//g; } $new_length = length($text); print "Num: $match2 Len: $match_len\n"; if($match2) { if($match_len < (2 + length($image))) { Pearle::myLog(0, "Short replacement of $match_len bytes (min " . (length($image) + 2) . ") in [[$page]] ($match2 matches). Exiting.\n"); Pearle::myLog(0, "Text:\n$text\n"); print Dumper($1); print Dumper($image); exit; } # If many matches, log a warning if($match2 > 2) { Pearle::myLog(3, "More than one match ($match2) in page [[$page]]\n"); } if($match2 > 100) { Pearle::myLog(1, "Too many matches ($match2) in page [[$page]]. Skipping.\n"); botwarnlog("*Too many matches ($match2) in page [[$page]]. Skipping.\n"); return 0; } } skipregular: # Put the text back and get it again in order to fold any comments resulting from removing non-gallery images. # This is because gallery image matching will also match commented images. $wikipage->setEditableText($text); $text = $wikipage->getEditableText(); if($text =~ /<gallery/i) { Pearle::myLog(3, "*Possible image gallery in page [[$page]]\n"); if(defined($removal_prefix)) { if($text =~ s/$regex3g/<!-- $removal_prefix $1 -->/g) { $match2 += 1; } } else { if($text =~ s/$regex3g//g) { $match2 += 1; } } } if($match2 > 0) { if($text =~ /\[\[(?: |)<!--/) { Pearle::myLog(2, "Possible multiline image in page [[$page]]\n"); botwarnlog(FixupLinks("*Possible multiline image in page [[$page]]\n")); } } # Improved infobox removal my $infobox_regex = "([-A-Za-z0-9_]+[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*=)[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; my $infobox_regex_full = "([-A-Za-z0-9_]+[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*=)[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . '(?:[Ii][Mm][Aa][Gg][Ee]|[Ff][Ii][Ll][Ee])[ _]*:[ _]*' . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; if($text =~ /$infobox_regex/) { Pearle::myLog(3, "Matched on infobox regex: $infobox_regex\n"); Pearle::myLog(3, "Infobox parameter: $1\n"); if($& =~ /puic/) { botwarnlog(FixupLinks("*PUIC in page [[$page]]\n")); } else { my $sub = $1; my $match_regex = MakeWikiRegex($sub) . "[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; $text =~ s/$match_regex/$sub/g; $match2 += 1; } } if($text =~ /$infobox_regex_full/) { Pearle::myLog(3, "Matched on infobox regex: $infobox_regex_full\n"); Pearle::myLog(3, "Infobox parameter: $1\n"); if($& =~ /puic/) { botwarnlog(FixupLinks("*PUIC in page [[$page]]\n")); } else { my $sub = $1; my $match_regex = MakeWikiRegex($sub) . "[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . '(?:[Ii][Mm][Aa][Gg][Ee]|[Ff][Ii][Ll][Ee])[ _]*:[ _]*' . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; $text =~ s/$match_regex/$sub/g; $match2 += 1; } } if($match2) # No need to null-edit articles anymore { if($test_only) { notelog("Test removal from page succeeded\n"); } else { # Submit the changes $wikipage->setEditableText($text); eval { Pearle::postPage($wikipage, $removal_comment, 0); }; if($@) { if($@ =~ /^924 Spam filter: (.*)$/) { botwarnlog("*Spam filter on page [[$page]], url <nowiki>$1\n"); $match2 = 0; # We weren't able to remove it } elsif($@ =~ /^922/) { # Edit conflict. Try editing the page again. botwarnlog("*Edit conflict on page [[$page]]\n"); goto tryagain; } else { die; } } } } return ($match2) } sub ReplaceImage { my $image = shift; my $page = shift; my $image_regex = shift; my $new_image = shift; my $reason = shift; my $wikipage; my $text; my ($match1, $match2); my $old_length; my $new_length; my $change_len; my $match_len; tryagain: # Fetch an article page $wikipage = Pearle::getPage($page); $wikipage->canonicalizeLinks(); $text = $wikipage->getEditableText(); if(!defined($text)) { Pearle::myLog(1, "Error: Bad edit page [[$page]]\n"); botwarnlog(FixupLinks("*Error: Bad edit page [[$page]]\n")); sleep(300); return 0; } if($text =~ /^\s*$/) { # Might be protected instead of empty Pearle::myLog(1, "Error: Empty or protected page [[$page]]\n"); botwarnlog(FixupLinks("*Error: Empty or protected page [[$page]]\n")); sleep(300); return 0; } if($text =~ /^#redirect/i) { Pearle::myLog(1, "Redirect found for page [[$page]] (image [[:$image]])\n"); botwarnlog(FixupLinks("*Redirect found for page [[$page]] (image [[:$image]])\n")); print $text; return 0; } # Remove the image my $regex3 = "([ \\t]*\x01)${image_regex}([^\x01]*?(\x01[^\x02]*?\x02[^\x01]*?|)+\x02[ \\t]*)"; # Regex to match images #my $regex3 = "( # [ \\t]* # Any leading whitespace # \x01 # Open double-bracket for the image # ${image_regex} # The image itself # [^\x01]*? # Anything up to the first link in the caption, or a closing double bracket (minimal match) # (\x01 # Open double-bracket for a link in the caption # [^\x02]*? # Anything but a closing double-bracket # \x02 # The closing double-bracket for the link # [^\x01]*?|) # Any non-link text, or nothing # + # Matches one or more times # \x02 # The closing double-bracket for the image # [ \\t]*) # Any trailing whitespace # "; my $regex3g = "${image_regex}(.*)"; # Regex to match gallery images my ($raw_image) = $image =~ /(?:Image|File):(.*)/; my $regex4m = "(\x01[ _]*[Mm][Ee][Dd][Ii][Aa][ _]*:[ _]*)" . MakeWikiRegex($raw_image) . "([ _]*\\|([^]]*)\x02)"; # Regex to match inline Media: links Pearle::myLog(3, "Regex 3: $regex3\n"); notelog("Regex 3 gallery: $regex3g\n"); Pearle::myLog(3, "Raw regex: $raw_image\n"); notelog("Regex 4 Media: $regex4m\n"); $old_length = length($text); $match2 = $text =~ s/$regex3/$1$new_image$2/g; $new_length = length($text); print "Num: $match2 Len: $match_len\n"; if($match2) { # If the length change isn't right, log a warning and return without saving if($new_length != $old_length - ($match2 * (length($image) - length($new_image)))) { botwarnlog(FixupLinks("*Length mismatch on [[$page]] replacing [[$image]] with [[$new_image]]\n")); Pearle::myLog(1, "Length mismatch on $page replacing [[$image]] with [{$new_image]]\n"); return 0; } # If many matches, log a warning if($match2 > 2) { Pearle::myLog(3, "More than one match ($match2) in page [[$page]]\n"); } if($match2 > 100) { Pearle::myLog(1, "Too many matches ($match2) in page [[$page]]. Skipping.\n"); botwarnlog("*Too many matches ($match2) in page [[$page]]. Skipping.\n"); return 0; } } if($text =~ /<gallery/i) { Pearle::myLog(3, "*Possible image gallery in page [[$page]]\n"); if($new_image =~ /$regex3g/) { Pearle::myLog(2, "New image name $new_image is a substring of old image name $image\n"); botwarnlog("*New image [[$new_image]] is a substring of [[$image]] in page [[$page]]\n"); } else { my $gallery_matches = $text =~ s/$regex3g/$new_image$1/g; $match2 += $gallery_matches; } } # Improved infobox replacement my $infobox_regex = "([-A-Za-z0-9_]+[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*=)[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; my $infobox_regex_full = "([-A-Za-z0-9_]+[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*=)[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . '(?:[Ii][Mm][Aa][Gg][Ee]|[Ff][Ii][Ll][Ee])[[ _]*:[ _]*' . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; if($text =~ /$infobox_regex/) { Pearle::myLog(3, "Matched on infobox regex: $infobox_regex\n"); Pearle::myLog(3, "Infobox parameter: $1\n"); if($& =~ /puic/) { botwarnlog(FixupLinks("*PUIC in page [[$page]]\n")); } else { my $sub = $1; my $match_regex = MakeWikiRegex($sub) . "[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; my $non_prefixed_new_image = $new_image; $non_prefixed_new_image =~ s/^(Image|File)://; $text =~ s/$match_regex/${sub}${non_prefixed_new_image}/g; $match2 += 1; } } if($text =~ /$infobox_regex_full/) { Pearle::myLog(3, "Matched on infobox regex: $infobox_regex_full\n"); Pearle::myLog(3, "Infobox parameter: $1\n"); if($& =~ /puic/) { botwarnlog(FixupLinks("*PUIC in page [[$page]]\n")); } else { my $sub = $1; my $match_regex = MakeWikiRegex($sub) . "[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*" . '(?:[Ii][Mm][Aa][Gg][Ee]|[Ff][Ii][Ll][Ee])[ _]*:[ _]*' . MakeWikiRegex($raw_image) . "[ \x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}_]*"; $text =~ s/$match_regex/${sub}$new_image/g; $match2 += 1; } } if($match2) # No need to null-edit articles anymore { if($test_only) { notelog("Test removal from page succeeded\n"); } else { # Submit the changes $wikipage->setEditableText($text); eval { Pearle::postPage($wikipage, $reason, 0); }; if($@) { if($@ =~ /^924 Spam filter: (.*)$/) { botwarnlog("*Spam filter on page [[$page]], url $1\n"); $match2 = 0; # We weren't able to remove it } elsif($@ =~ /^922/) { # Edit conflict. Try editing the page again. botwarnlog("*Edit conflict on page [[$page]]\n"); goto tryagain; } else { die; } } } } return ($match2) } # Returns 1 if the user has been notified, or 0 if they haven't sub IsNotified { my $uploader = shift; my $image_regex = shift; my $image_name = shift; my $notes_ref = shift; my $donts_ref = shift; # Check notification list if(defined($notes_ref) and $notes_ref->{"$uploader,$image_name"}) { Pearle::myLog(2, "Already notified for this image\n"); return 1; } if(defined($donts_ref) and $donts_ref->{$uploader}) { Pearle::myLog(2, "On exception list: $uploader\n"); return 1; } # Check uploader's talkpage my $page_data = Pearle::APIQuery(titles => "User talk:$uploader", prop => ['links', 'templates'], plnamespace => 6, pllimit => 500, tlnamespace => 10, tllimit => 500); $image_regex = MakeWikiRegex($image_name) if(!defined($image_regex)); if($page_data =~ /$image_regex/) { Pearle::myLog(2, "Has a link from userpage\n"); return 1; } if(usesTemplate($page_data, "Template:Nobots")) { Pearle::myLog(2, "Uses {{nobots}}\n"); return 1; } return 0; } # Returns 1 if the page has been notified, or 0 if it hasn't sub IsPageNotified { my $page = shift; my $image_regex = shift; my $image_name = shift; my $notes_ref = shift; my $donts_ref = shift; # Check notification list if($notes_ref->{"$page,$image_name"}) { Pearle::myLog(2, "Already notified for this image\n"); return 1; } if($donts_ref->{$page}) { Pearle::myLog(2, "On exception list: $page\n"); return 1; } # Check page my $page_data = Pearle::APIQuery(titles => $page, prop => 'links', plnamespace => 6); if($page_data =~ /$image_regex/) { Pearle::myLog(2, "Has a link from page\n"); return 1; } return 0; } sub isDated { my $image_text = shift; if($image_text =~ /\((\d\d?) (\w*) (\d\d\d\d)\)/) # Dated template { Pearle::myLog(4, "Dated tag $1 $2 $3\n"); return 1; } # as of 6 October 2006"> elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category { Pearle::myLog(4, "Template borked; category $1 $2 $3\n"); return 1; } elsif($image_text =~ /{{{day}}} {{{month}}} \d\d\d\d/) # Generic template { Pearle::myLog(4, "Generic tag\n"); return 0; } else { Pearle::myLog(4, "No tag match\n"); return 0; } } # Return the tag date if there is one, the upload date if not # Returns in (day, month, year) format sub getDate { my $image_text = shift; if($image_text =~ /\((\d\d?) (\w*) (\d\d\d\d)\)/) { Pearle::myLog(4, "Template date $1-$2-$3\n"); return ($1, $2, $3); } elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category { Pearle::myLog(4, "Category date 'as of' $1-$2-$3\n"); return ($1, $2, $3); } elsif($image_text =~ /from (\d\d?) (\w*) (\d\d\d\d)/) # Alternate category naming { Pearle::myLog(4, "Category date 'from' $1-$2-$3\n"); return ($1, $2, $3); } else { Pearle::myLog(4, "No date\n"); return (1, "January", 2007); } } # Return a list of upload dates sub getUploadDates { my @dates; my $image_text = shift; while($image_text =~ />\d\d?:\d\d, (\d\d?) (\w*) (\d\d\d\d)</g) { push @dates, [$1, $2, $3]; } return @dates; } sub getLastEditDate { my ($day, $month, $year); my $image = shift; my @history = Pearle::parseHistory($image); (undef, $day, $month, $year) = @{$history[0]}; return ($day, $month, $year); } # Find the most recent non-vandal, non-revert uploader sub GetImageUploader { my $image_data = shift; my ($uploader, $sha1, $comment); my @uploaders; my $uploader_data; my $i = 0; my $count = 0; my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data); Pearle::myLog(4, Dumper($parsed_xml)); if(exists($parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii}) and defined($parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii})) { if(ref($parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii}) eq 'ARRAY') { @uploaders = @{$parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii}}; } else { return $parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii}->{user}; } } else { return undef; } $uploader = $uploaders[0]->{user}; $sha1 = $uploaders[0]->{sha1}; $comment = $uploaders[0]->{comment} || ""; my $done = 0; while(!$done) { if($comment =~ /^Reverted/) { Pearle::myLog(4, "Revert found\n"); $i += 1; while($uploaders[$i]->{sha1} ne $sha1) { $i = $i + 1; } } elsif($comment =~ /optimi(z|s)ed|adjust|tweak|scale|crop|change|resize|remove|reduc(e|ing)/i) { Pearle::myLog(4, "Tweak found\n"); $i = $i + 1; } elsif(!defined($uploader)) { Pearle::myLog(4, "Something went wrong with finding the uploader\n"); $done = 1; } elsif($count > 500) { Pearle::myLog(4, "Took too long finding the uploader\n"); $uploader = undef; $done = 1; } else { $done = 1; } $uploader = $uploaders[$i]->{user}; $sha1 = $uploaders[$i]->{sha1}; $comment = $uploaders[$i]->{comment} || ""; $count = $count + 1; } if(defined($uploader)) { Pearle::myLog(4, "Uploader: $uploader\n"); return $uploader; } else { return undef; } } sub loadNotificationList { my $file = shift; my %notelist; my $i = 0; notelog("File: $file\n"); open INFILE, "<:utf8", $file; while(<INFILE>) { $_ =~ s/\s*#.*$//g; chomp; $notelist{$_} = 1; $i++; } close INFILE; notelog("$i notifications loaded\n"); return %notelist; } sub saveNotificationList { return if($test_only); my $file = shift; my %notelist = @_; my $key; open OUTFILE, ">:utf8", $file; foreach $key (keys(%notelist)) { print OUTFILE "$key\n"; } close OUTFILE; } # Does a page transclude any of a set of templates? Template names must be in the canonical form, with prefix. sub usesTemplate { my $xml = shift; my @templates = @_; my $parsed_xml = ref($xml)?$xml:Pearle::getXMLParser()->XMLin($xml, ForceArray => ['tl']); Pearle::myLog(4, "Templates: " . join(", ", @templates) . "\n"); # Pearle::myLog(4, Dumper($parsed_xml)); if(!exists($parsed_xml->{query}->{pages}->{page}->{templates}->{tl}) or !defined($parsed_xml->{query}->{pages}->{page}->{templates}->{tl})) { Pearle::myLog(4, "No templates or no page\n"); return 0; } my $result = eval { my @used_templates = @{$parsed_xml->{query}->{pages}->{page}->{templates}->{tl}}; @used_templates = map {$_->{title}} @used_templates; Pearle::myLog(4, "Used: " . join(", ", @used_templates) . "\n"); Pearle::myLog(4, "Intersect: " . join(", ", Array::Utils::intersect( @templates, @used_templates )) . "\n"); if(Array::Utils::intersect( @templates, @used_templates )) { return 1; } else { return 0; } }; if($@) { # Probably more than one page in the xml print "usesTemplate error: $@"; return 0; } return $result; } sub DoesPageExist { my $page = shift; my $xml = Pearle::APIQuery(titles => [$page], prop => ['info']); # TODO: Handle API query errors ('undef' return values) if($xml =~ /missing=""/) { return 0; } return 1; } 1; </nowiki>