User:WatchlistBot/source.py: Difference between revisions
Appearance
Content deleted Content added
m bug fix, allow multiple project updates without requiring all |
some new code, some improvements |
||
Line 130: | Line 130: | ||
self.portalsTalk = dict.fromkeys(self.portalsTalk).keys() |
self.portalsTalk = dict.fromkeys(self.portalsTalk).keys() |
||
self.portalsTalk.sort() |
self.portalsTalk.sort() |
||
def organizeCategories (self, topLevel = "Numismatics"):
    """
    build the hierarchical listing of the project's categories

    topLevel is the name of the root category, without the "Category:"
    prefix
    returns the listing as text: a link to the root category on the first
    line, followed by whatever organizeCatsNextLevel adds for the levels
    below it
    """
    # the root category always heads the listing
    listing = "[[:Category:"+topLevel+"]]<br>\n"
    rootCat = catlib.Category(wikipedia.getSite(), "Category:" + topLevel)
    # categories directly below the root are prefixed with "|—"
    return self.organizeCatsNextLevel(listing, rootCat, "|—")
|||
def organizeCatsNextLevel (self, text, cat, substring):
    """
    recursively append the subcategories of cat to the category listing

    text is the listing built so far; new lines are added to the end of it
    cat is the catlib.Category of the previous level
    substring is the text to put before each category on this level (each
        deeper level gets "| " put in front of it)
    returns the extended listing

    only subcategories whose ":"-prefixed title is in self.categories are
    listed. A subcategory whose link is already in the listing is marked
    "(already included, see above)" and is not descended into again.
    """
    for subcat in cat.subcategories():
        title = subcat.title()
        # skip subcategories which are not included in our project
        if (":"+title not in self.categories):
            continue
        link = "[[:" + title + "]]"
        # look for the complete link, not the bare title: a plain substring
        # search would report "Category:Coins" as already listed as soon as
        # "Category:Coins of Europe" is somewhere in the text
        if (link not in text):
            text += substring + link + "<br>\n"
            text = self.organizeCatsNextLevel(text, subcat, "| "+substring)
        else:
            # it's already been listed: don't recurse in this case, to
            # prevent duplication and, more importantly, infinite loops
            text += substring + link + " (already included, see above)<br>\n"
    return text
|||
def getTaggedPages (self): |
def getTaggedPages (self): |
||
Line 147: | Line 177: | ||
# remove duplicates and sort the lists |
# remove duplicates and sort the lists |
||
self.removeDuplicatesAndSort() |
self.removeDuplicatesAndSort() |
||
# organize the categories hierarchically (actually, no -- this takes too |
|||
# much time) |
|||
#self.catText = self.organizeCategories() |
|||
def getPagesFromTaggedCategories (self): |
def getPagesFromTaggedCategories (self): |
||
Line 168: | Line 202: | ||
# remove duplicates and sort the lists |
# remove duplicates and sort the lists |
||
self.removeDuplicatesAndSort() |
self.removeDuplicatesAndSort() |
||
# organize the categories hierarchically (actually, no -- this takes too |
|||
# much time) |
|||
#self.catText = self.organizeCategories() |
|||
def writeList (self, taggedPagesFlag): |
def writeList (self, taggedPagesFlag): |
||
Line 183: | Line 221: | ||
wikipedia.output(u"Preparing output") |
wikipedia.output(u"Preparing output") |
||
output = self.project.replace(" ", "_") + " |
output = self.project.replace(" ", "_") + "/" + \ |
||
self.articleOut.replace(" ", "_") |
self.articleOut.replace(" ", "_") |
||
Line 512: | Line 550: | ||
f.close() |
f.close() |
||
def untagPage (pageName, tag): |
def untagPage (pageName, tag, params = ""): |
||
""" |
""" |
||
remove the tag from the given talk page, if it is there |
remove the tag from the given talk page, if it is there |
||
params is an optional list of parameters for the tag |
|||
""" |
""" |
||
page = wikipedia.Page(wikipedia.getSite(), pageName) |
page = wikipedia.Page(wikipedia.getSite(), pageName) |
||
Line 526: | Line 565: | ||
writePage(page, text, "Removing " + tag) |
writePage(page, text, "Removing " + tag) |
||
def tagPage (pageName, tag): |
def tagPage (pageName, tag, params = ""): |
||
""" |
""" |
||
tag the given talk page with the tag |
tag the given talk page with the tag |
||
params is an optional list of parameters for the tag (like class=Stub) |
|||
""" |
""" |
||
# get the talk page |
# get the talk page |
||
Line 535: | Line 575: | ||
if not page.isRedirectPage(): |
if not page.isRedirectPage(): |
||
text = page.get() |
text = page.get() |
||
tagIt(page, text, tag) |
tagIt(page, text, tag+params) |
||
else: |
else: |
||
wikipedia.output("Page " + page.title() + " is a redirect") |
wikipedia.output("Page " + page.title() + " is a redirect") |
||
else: |
else: |
||
# we don't mind if the page doesn't exist yet, just create it |
# we don't mind if the page doesn't exist yet, just create it |
||
tagIt(page, "", tag) |
tagIt(page, "", tag+params) |
||
def tagIt (page, text, tag): |
def tagIt (page, text, tag): |
||
Line 592: | Line 632: | ||
def updateCategoryList (catList, catName, |
def updateCategoryList (catList, catName, taggedCats, |
||
keywords, |
|||
questionText = u"Do you want to tag ", confirm = True): |
|||
""" |
""" |
||
if catList starts with "", it means we're trying to quit, so just return |
|||
add the given category to the given category list |
|||
starting at catName, make a list, catList, of all subcategories |
|||
ask the user first, and allow the user the choice to recurse |
ask the user first, and allow the user the choice to recurse |
||
through subcategories |
through subcategories |
||
taggedCats is the list of categories that are already tagged and can thus |
|||
be skipped |
|||
keywords are words that if they're in the category, it will be tagged |
keywords are words that if they're in the category, it will be tagged |
||
without confirmation |
without confirmation |
||
if confirm is false, no confirmation question will be asked (all will be |
|||
included) |
|||
""" |
""" |
||
# check if we're quitting |
|||
if (len(catList) > 1 and catList[0] == ""): |
|||
return catList |
|||
cat = catlib.Category(wikipedia.getSite(), "Category:" + catName) |
cat = catlib.Category(wikipedia.getSite(), "Category:" + catName) |
||
response = "n" |
response = "n" |
||
if (catName not in catList): |
if (catName not in catList): |
||
# if the category is already in taggedCats, treat that like a |
|||
# "y" from the user |
|||
if ("Category:"+catName in taggedCats): |
|||
response = "y" |
|||
# if the name has a keyword in it, treat that like a "y" from the user |
|||
for keyword in keywords: |
for keyword in keywords: |
||
if (keyword in catName): |
if (keyword in catName): |
||
response = "y" |
response = "y" |
||
# if we haven't found a keyword, ask the user |
# if we haven't found a keyword, ask the user (but if confirm is False, |
||
# treat it as if the user already said yes) |
|||
if (confirm == False): |
|||
response = "y" |
|||
if (response == "n"): |
if (response == "n"): |
||
response = wikipedia.input( |
response = wikipedia.input(questionText + cat.title() + u"? (y for yes, yn for yes but no recursion, s for stop recursion)") |
||
if (response == "s"): |
|||
# put "" into the catlist at the beginning as a marker |
|||
catList.insert(0, "") |
|||
return catList |
|||
# add the category to the list |
# add the category to the list |
||
Line 621: | Line 686: | ||
for subcat in subcats: |
for subcat in subcats: |
||
updateCategoryList(catList, subcat.titleWithoutNamespace(), |
updateCategoryList(catList, subcat.titleWithoutNamespace(), |
||
keywords) |
taggedCats, keywords, questionText, confirm) |
||
return catList |
|||
def tagCategories (catName, tag |
def tagCategories (catName = "Baseball", tag = "Baseball-WikiProject", |
||
params = "|class=NA", |
|||
keywords = ["Baseball", "baseball", "field personnel", "players", |
|||
"managers", "coaches", "World Series"]): |
|||
""" |
""" |
||
tag all categories in the specified category and subcategories with the |
tag all categories in the specified category and subcategories with the |
||
Line 632: | Line 701: | ||
""" |
""" |
||
wikipedia.put_throttle.setDelay(10, absolute = True) |
wikipedia.put_throttle.setDelay(10, absolute = True) |
||
# get the list of categories which are already tagged |
|||
taggedCatList = [] |
|||
taggedArticleList = [] |
|||
getTagged(tag, taggedCatList, taggedArticleList) |
|||
# get the category list |
# get the category list |
||
catList = [] |
catList = [] |
||
updateCategoryList(catList, catName, keywords) |
catList = updateCategoryList(catList, catName, taggedCatList, keywords) |
||
# if the first element of catList is "", remove it, it was just a marker |
|||
catList.remove("") |
|||
# remove duplicates and sort |
# remove duplicates and sort |
||
catList = dict.fromkeys(catList).keys() |
catList = dict.fromkeys(catList).keys() |
||
Line 641: | Line 719: | ||
# remove categories which are already tagged |
# remove categories which are already tagged |
||
taggedCatList = [] |
|||
taggedArticleList = [] |
|||
getTagged(tag, taggedCatList, taggedArticleList) |
|||
for cat in catList: |
for cat in catList: |
||
if (not "Category:"+cat in taggedCatList): |
if (not "Category:"+cat in taggedCatList): |
||
tagPage("Category talk:" + cat, tag) |
tagPage("Category talk:" + cat, tag, params) |
||
return catList |
|||
def getTagged (tag, catList, articles): |
def getTagged (tag, catList, articles): |
||
Line 681: | Line 754: | ||
def addNA (catName="Unassessed numismatic articles", tag="Numismaticnotice"):
    """
    add "|class=NA" to all articles in the specified category which are not
    in the article namespace (categories, images, etc)

    catName is the category to look in, without the "Category:" prefix
    tag is the name of the template tag the parameter is added to
    """
    pagesInCat = []
    # presumably fills pagesInCat with the page names found in the category
    # (the list is read right after the call) -- verify in
    # findArticlesInCategory
    findArticlesInCategory("Category:"+catName, pagesInCat, False)
    # keep only the names without "Talk:" in them, i.e. the ones that are
    # not in the main project namespace
    articlesToTag = [name for name in pagesInCat if "Talk:" not in name]
    addParams(articlesToTag, "|class=NA", tag, "Numismatics assessment, adding class=NA")
|||
text = page.get() |
|||
text = wikipedia.replaceExceptMathNowikiAndComments( |
|||
def addParams (firstCat = "Unassessed Louisville articles",
               secondCat = "Louisville stubs",
               recurse = True,
               params = "|class=Stub",
               tag = "WikiProject Louisville",
               comment = "Louisville assessment, adding class=Stub"):
    """
    find the articles in the intersection of firstCat and secondCat and add
    params to their tag

    firstCat and secondCat are category names without the "Category:" prefix
    if recurse is true, include all subcats of secondCat (but not firstCat);
        otherwise only secondCat itself is searched
    params is the text to add to the template
    tag is the name of the template tag
    comment is the text to use for the comment when saving
    """
    # NOTE(review): this file defines addParams a second time further down,
    # as addParams(articles, params, tag, comment). The later definition
    # replaces this one at module level, so this function cannot be reached
    # under this name, and the call on the last line below relies on that
    # later definition. One of the two should be renamed.

    # get the list of articles in the first category
    firstArticles = []
    findArticlesInCategory("Category:"+firstCat, firstArticles, False)
    print(str(len(firstArticles)))

    # get the list of categories to search for the second set of articles.
    # With confirm = False updateCategoryList treats every category as a
    # "y" answer (yes, with recursion), so it is only used when the caller
    # asked for the subcategories
    if (recurse):
        secondCatList = updateCategoryList([], secondCat, [], [],
                                           "Do you want to include ", False)
    else:
        secondCatList = [secondCat]

    # get the list of articles in the second category
    secondArticles = []
    for cat in secondCatList:
        findArticlesInCategory("Category:"+cat, secondArticles, False)
    print(str(len(secondArticles)))

    # get the list of articles that is in both, keeping the order of the
    # first list; the set makes each membership test constant time
    inSecond = set(secondArticles)
    articles = [article for article in firstArticles if article in inSecond]
    print(str(len(articles)))

    addParams(articles, params, tag, comment)
|||
def addParams (articles, params, tag, comment):
    """
    add params after the tag on each of the given pages

    articles is the list of articles to change
    params is the text to add to the template
    tag is the name of the template tag
    comment is the text to use for the comment when saving

    a page whose text already contains tag followed by params is left
    alone, so running this twice does not add the parameters twice; a page
    whose text is not changed by the replacement is not saved
    """
    for article in articles:
        page = wikipedia.Page(wikipedia.getSite(), article)
        text = page.get()
        # already has these parameters (or there is nothing to add)
        if (tag+params in text):
            continue
        newText = wikipedia.replaceExceptMathNowikiAndComments(
            text, tag, tag+params)
        # only save when the replacement actually changed something
        if (newText != text):
            writePage(page, newText, comment)
|||
def replaceTag (oldTag="LouisvilleWikiProject", newTag="WikiProject Louisville"):
    """
    replace the oldTag with the newTag (can be used to replace a tag with
    a tag plus parameters)

    the pages changed are the articles that getTagged collects for oldTag
    """
    taggedArticles = []
    getTagged(oldTag, [], taggedArticles)
    # the same edit comment is used for every page
    summary = "replacing " + oldTag + " with " + newTag
    for title in taggedArticles:
        page = wikipedia.Page(wikipedia.getSite(), title)
        updated = wikipedia.replaceExceptMathNowikiAndComments(
            page.get(), oldTag, newTag)
        writePage(page, updated, summary)
|||
def tag (tag = " |
def tag (tag = "WikiProject Hawaii", params = "", otherTag = "", |
||
project = " |
project = "Hawaii", confirm=False, catList = []): |
||
""" |
""" |
||
tag articles in tagged categories |
tag articles in tagged categories |
||
if a page is already tagged with otherTag, skip it ( |
if a page is already tagged with otherTag, skip it (use otherTag = "" for none) |
||
catList is a list of categories to check in. If empty, use tagged categories |
catList is a list of categories to check in. If empty, use tagged categories |
||
if params is given, include it after the tag, when tagging an article |
|||
""" |
""" |
||
# get the list of all tagged articles in taggedArticles |
|||
# if catList was given, leave it as is. Otherwise, populate catList with |
|||
# all tagged categories |
|||
taggedArticles = [] |
taggedArticles = [] |
||
if (len(catList) == 0): |
if (len(catList) == 0): |
||
Line 709: | Line 852: | ||
dummy = [] |
dummy = [] |
||
getTagged(tag, dummy, taggedArticles) |
getTagged(tag, dummy, taggedArticles) |
||
# put "Category:" in front of the category names |
|||
print len(taggedArticles) |
|||
newCatList = [] |
|||
for cat in catList: |
|||
newCatList.append("Category:"+cat) |
|||
catList = newCatList |
|||
# add the articles tagged with otherTag to the list of taggedArticles |
# add the articles tagged with otherTag to the list of taggedArticles |
||
Line 715: | Line 862: | ||
getTagged(otherTag, [], taggedArticles) |
getTagged(otherTag, [], taggedArticles) |
||
# get the list of untagged articles in the categories in catList (which |
|||
# was either supplied as a parameter, or was populated with tagged categories) |
|||
untaggedArticles = [] |
untaggedArticles = [] |
||
for cat in catList: |
for cat in catList: |
||
Line 722: | Line 871: | ||
untaggedArticles = dict.fromkeys(untaggedArticles).keys() |
untaggedArticles = dict.fromkeys(untaggedArticles).keys() |
||
untaggedArticles.sort() |
untaggedArticles.sort() |
||
print len(untaggedArticles) |
|||
# make a list of articles that need to be tagged (by removing articles |
# make a list of articles that need to be tagged (by removing articles |
||
Line 732: | Line 880: | ||
# remove excluded articles |
# remove excluded articles |
||
excluded = getExcludedArticles(project) |
excluded = getExcludedArticles(project) |
||
print excluded |
|||
for page in excluded: |
for page in excluded: |
||
if (page in untaggedArticles): |
if (page in untaggedArticles): |
||
Line 739: | Line 888: | ||
wikipedia.output(u"No untagged articles") |
wikipedia.output(u"No untagged articles") |
||
print "Tagging " + str(len(untaggedArticles)) + " articles" |
|||
# tag the articles |
# tag the articles |
||
for article in untaggedArticles: |
for article in untaggedArticles: |
||
tagPage(article, tag) |
tagPage(article, tag, params) |
||
wikipedia.stopme() |
wikipedia.stopme() |
||
projects = ["Numismatics", "Numismatics", "Hawaii", "Texas", "Ice Hockey", |
projects = ["Numismatics", "Numismatics", "Hawaii", "Texas", "Ice Hockey", |
||
"Louisville", "Kentucky", "Texas State Highways", " |
"Louisville", "Kentucky", "Texas State Highways", "Dallas", |
||
"Cricket"] |
"Comics", "Pittsburgh", "Baseball", "Automobiles", "Cricket"] |
||
def listProjects (): |
def listProjects (): |
||
Line 764: | Line 914: | ||
"WikiProject Texas", "Ice hockey", "WikiProject Louisville", |
"WikiProject Texas", "Ice hockey", "WikiProject Louisville", |
||
"WikiProject Kentucky", "Texas State Highway WikiProject", |
"WikiProject Kentucky", "Texas State Highway WikiProject", |
||
" |
"WikiProject Dallas", "comicsproj", "PittsburghWikiProject", |
||
"Baseball-WikiProject", "AutomobileWatch", "CricketWatch"] |
|||
articleOuts = ["Articles", "Exonumia articles", "Hawaii recent changes", |
articleOuts = ["Articles", "Exonumia articles", "Hawaii recent changes", |
||
"Articles", "Articles", "Watchall", "Watchall", "Watchlist", |
"Articles", "Articles", "Watchall", "Watchall", "Watchlist", |
||
"Articles", "Articles" |
"Articles", "Articles", "Articles", "Articles", "Articles", |
||
"Articles"] |
|||
# pages to include even though they aren't tagged |
# pages to include even though they aren't tagged |
||
includePagesLists = [["Template:AfricanCurrencies", "Template:AmericanCurrencies", |
includePagesLists = [["Template:AfricanCurrencies", "Template:AmericanCurrencies", |
||
"Template:AsianCurrencies", "Template:EuropeanCurrencies"], |
"Template:AsianCurrencies", "Template:EuropeanCurrencies"], |
||
[], [], [], [], [], [], [], [], []] |
[], [], [], [], [], [],[], [], [], [], [], [], []] |
||
# true if we're getting tagged articles, false if we're getting articles |
# true if we're getting tagged articles, false if we're getting articles |
||
# in tagged categories |
# in tagged categories |
||
taggedPagesFlags = [True, True, True, True, True, True, True, True, |
taggedPagesFlags = [True, True, True, True, True, True, True, True, True, |
||
True, True, True, False, False] |
|||
if (len(projectNums) == 0): |
if (len(projectNums) == 0): |
Revision as of 04:59, 20 January 2007
import catlib
import wikipedia
import codecs

# the maximum number of articles per page
MAX = 10000
# should we write to file or directly to wikipedia?
DBG = False


class Watchlist:
    """
    Collects the pages belonging to one WikiProject (found through the
    project's talk-page template) and writes them out as a watchlist page,
    split over several subpages when there are too many entries.
    """

    # the name of the template used to tag articles, e.g., "Numismaticnotice"
    template = ""
    # the name of the project, e.g., "Numismatics"
    project = ""
    # the location of the article list (output) -- without prefix, so for
    # "Wikipedia:WikiProject Numismatics/Articles", use "Articles"
    articleOut = ""
    # a list for all articles
    articles = []
    # a list for all article talk pages
    articlesTalk = []
    # a list for all Wikipedia pages
    wikis = []
    # a list for all Wikipedia talk pages
    wikisTalk = []
    # a list for all templates
    templates = []
    # a list for all template talk pages
    templatesTalk = []
    # a list for all categories
    categories = []
    # a list for all category talk pages
    categoriesTalk = []
    # a list for all images
    images = []
    # a list for all image talk pages
    imagesTalk = []
    # a list for all portals
    portals = []
    # a list for all portal talk pages
    portalsTalk = []
    # certain pages need to be included explicitly (for example, if they share
    # a talk page)
    includePages = []

    # names of every list attribute handled by removeDuplicatesAndSort
    _LIST_ATTRS = ("articles", "articlesTalk", "wikis", "wikisTalk",
                   "templates", "templatesTalk", "categories",
                   "categoriesTalk", "images", "imagesTalk",
                   "portals", "portalsTalk")

    def __init__(self, template, project, articleOut, includePages = None):
        """
        template is the tagging template name, project the project name,
        articleOut the output subpage name, includePages an optional list of
        page names to include even though they are not tagged
        """
        # None instead of a shared mutable default list
        if (includePages is None):
            includePages = []
        self.template = template
        self.project = project
        self.articleOut = articleOut
        # every instance gets its own lists (the class-level ones are shared)
        for attr in self._LIST_ATTRS:
            setattr(self, attr, [])
        self.includePages = includePages

    def processPageName (self, name):
        """
        Process one page name, updating the lists as appropriate.
        Both the subject page and its talk page are recorded, whichever of
        the two was passed in.
        """
        # split on the first colon only, so that titles which themselves
        # contain a colon ("Talk:Star Trek: Voyager") are not truncated
        result = name.split(":", 1)
        if (len(result) == 1):
            self.articles.append(name)
            self.articlesTalk.append("Talk:"+name)
            return
        prefix, title = result
        if (prefix == "Talk"):
            self.articles.append(title)
            self.articlesTalk.append("Talk:"+title)
        elif (prefix == "Wikipedia talk" or prefix == "Wikipedia"):
            self.wikis.append("Wikipedia:"+title)
            self.wikisTalk.append("Wikipedia talk:"+title)
        elif (prefix == "Template talk" or prefix == "Template"):
            self.templates.append("Template:"+title)
            self.templatesTalk.append("Template talk:"+title)
        elif (prefix == "Category talk" or prefix == "Category"):
            # leading colon makes a link instead of categorizing the list page
            self.categories.append(":Category:"+title)
            self.categoriesTalk.append("Category talk:"+title)
        elif (prefix == "Image talk" or prefix == "Image"):
            # leading colon links to the image instead of displaying it
            self.images.append(":Image:"+title)
            self.imagesTalk.append("Image talk:"+title)
        elif (prefix == "Portal talk" or prefix == "Portal"):
            self.portals.append("Portal:"+title)
            self.portalsTalk.append("Portal talk:"+title)
        # any other prefix (user pages, or an article title containing a
        # colon) is ignored, as before

    def scanCat (self, catName, recurse):
        """
        add every page in the category catName (and, if recurse is set, in
        its subcategories) plus the category itself to the lists
        """
        cat = catlib.Category(wikipedia.getSite(), catName)
        for page in cat.articles(recurse):
            self.processPageName(page.title())
        self.categories.append(":Category:"+catName)
        self.categoriesTalk.append("Category talk:"+catName)

    def removeDuplicatesAndSort (self):
        """
        replace each page list by a sorted copy without duplicates
        """
        for attr in self._LIST_ATTRS:
            setattr(self, attr, sorted(set(getattr(self, attr))))

    def organizeCategories (self, topLevel = "Numismatics"):
        """
        organize the categories hierarchically
        """
        cat = catlib.Category(wikipedia.getSite(), "Category:" + topLevel)
        text = "[[:Category:"+topLevel+"]]<br>\n"
        text = self.organizeCatsNextLevel(text, cat, "|—")
        return text

    def organizeCatsNextLevel (self, text, cat, substring):
        """
        recursively organize the category text
        text is the text so far, add to that
        cat is the catlib.Category of the previous level
        substring is the text to put before each category
        """
        for subcat in cat.subcategories():
            # if this subcategory is included in our project
            if (":"+subcat.title() in self.categories):
                # if it has not already been listed (to prevent duplication,
                # but more importantly, to prevent infinite loops)
                if (text.find(subcat.title()) == -1):
                    text += substring + "[[:" + subcat.title() + "]]<br>\n"
                    text = self.organizeCatsNextLevel(text, subcat,
                                                      "| "+substring)
                else: # it's already been listed
                    text += substring + "[[:" + subcat.title() + \
                            "]] (already included, see above)<br>\n"
                    # don't recurse in this case, to prevent infinite loops
        return text

    def getTaggedPages (self):
        """
        Get the pages that include templateName
        Add the articles to the appropriate lists
        """
        tmpl = wikipedia.Page(wikipedia.getSite(), "Template:" + self.template)
        for ref in tmpl.getReferences(onlyTemplateInclusion=True):
            self.processPageName(ref.title())
        # include the explicitly named pages
        for name in self.includePages:
            self.processPageName(name)
        # remove duplicates and sort the lists
        self.removeDuplicatesAndSort()
        # organizing the categories hierarchically is deliberately skipped
        # here: it takes too much time

    def getPagesFromTaggedCategories (self):
        """
        Get the pages that are in categories whose talk page includes the
        template, plus those categories themselves
        """
        tmpl = wikipedia.Page(wikipedia.getSite(), "Template:" + self.template)
        articles = []
        for ref in tmpl.getReferences(onlyTemplateInclusion=True):
            result = ref.title().split(":", 1)
            if (len(result) == 2 and result[0] == "Category talk"):
                # we expect this
                findArticlesInCategory("Category:" + result[1], articles)
                # add the category to the list as well
                articles.append(ref.title())
        for name in sorted(set(articles)):
            self.processPageName(name)
        # remove duplicates and sort the lists
        self.removeDuplicatesAndSort()
        # organizing the categories hierarchically is deliberately skipped
        # here: it takes too much time

    def _trackingBlurb (self, lead, tagText, target):
        """
        build the explanatory paragraph placed at the top of an output page
        lead is the opening words, tagText is "" or "in categories ", target
        is the output page name (with underscores) used in the two links
        """
        return lead + tagText + "with {{tl|" + self.template + "}} " + \
               "on their talk page. It was generated by [[User:WatchlistBot|" + \
               "WatchlistBot]]. Its purpose is to be able to track " + \
               "the project history using ''[[Special:Recentchangeslinked/" + \
               "Wikipedia:WikiProject " + target + \
               "|related changes]]'' or ''[http://tools.wikimedia.de/~interiot/" + \
               "cgi-bin/offtoolserver/RC_firstonly?url=http%3A%2F%2Fen.wikipedia.org" + \
               "%2Fw%2Findex.php%3Ftitle%3DSpecial%3ARecentchangeslinked%26target" + \
               "%3DWikipedia:WikiProject_" + target + \
               "%26hideminor%3D0%26days%3D7%26limit%3D500 related watchlist]'' which " + \
               "only shows the last change for each article.\n" + \
               "\n"

    def _appendSection (self, mainText, subText, heading, anchor, names,
                        splitting, pageNo):
        """
        add one namespace section (heading plus list of names)
        when splitting, the list goes to subText and mainText only gets a
        link to the subpage; otherwise the list goes straight to mainText
        returns the updated (mainText, subText)
        """
        mainText += heading
        if (splitting):
            subText += "===" + anchor + "===\n"
            mainText += "*[[/Page" + str(pageNo) + "#" + anchor + "|" + \
                        anchor + "]]\n"
        for s in names:
            subText += "*[[" + s + "]]\n"
        if (not splitting):
            mainText += subText
            subText = ""
        return mainText, subText

    def writeList (self, taggedPagesFlag):
        """
        write the output to the specified page on Wikipedia
        taggedPagesFlag tells whether we're looking for tagged pages (true)
        or tagged categories (false)
        """
        tagText = ""
        if (not taggedPagesFlag):
            tagText = "in categories "

        # the output page, without spaces
        wikipedia.output(u"Preparing output")
        output = self.project.replace(" ", "_") + "/" + \
                 self.articleOut.replace(" ", "_")
        totalArticles = len(self.articles) + len(self.wikis) + \
                        len(self.templates) + len(self.categories) + \
                        len(self.images) + len(self.portals)

        mainText = "<div class=\"notice\" " + \
                   "style=\"background:#ffe1a7; border:1px solid #AAA; " + \
                   "padding:0.2em; margin:0.5em auto;\"> " + \
                   "[[Image:Stop_hand.svg|left|20px]] This page is automatically " + \
                   "recreated from time to time. Accordingly, any changes you " + \
                   "make here will be overwritten. See below for details.</div>\n\n"

        # double the number of articles because of talk pages
        splitting = (totalArticles*2 > MAX)
        if (splitting):
            mainText += "There are too many articles in this project to list " + \
                        "them all on one page. This article contains the first " + \
                        str(MAX) + " articles and links to other articles which " + \
                        "contain "
        else:
            mainText += "This article contains "
        mainText += self._trackingBlurb(
            "links to all articles, categories, images, portal pages " +
            "templates, and project pages ", tagText, output)

        mainText += "==Regular content (count: " + str(totalArticles) + ")==\n"

        # the number of articles listed on this page
        count = 0
        # the page number
        pageNo = 1
        mainText += "===Articles (count: " + str(len(self.articles)) + ")===\n"
        # None never equals a first letter, so the first heading is always
        # written (the old "Z" start value lost the heading for "Z" titles)
        prevChar = None
        firstChar = "Z"
        # the text for this subpage (if no subpages, will just be on the main
        # page)
        subText = ""
        # make sure the first batch of articles goes to the main page
        firstBatch = True
        for s in self.articles:
            if (s[0] != prevChar):
                subText += "====" + s[0] + "====\n"
                prevChar = s[0]
            if (count == 0):
                firstChar = prevChar
            subText += "*[[" + s + "]]\n"
            count = count+1
            if (count > MAX):
                count = 0
                if (firstBatch):
                    firstBatch = False
                    mainText += subText
                else:
                    mainText += "*[[/Page" + str(pageNo) + "|" + \
                                firstChar + "-" + prevChar + "]]\n"
                    subText = subText.replace("<range>", firstChar +
                                              " through " + prevChar)
                    self.writeProjPage(self.articleOut + "/Page" + str(pageNo),
                                       subText)
                    pageNo = pageNo+1
                # start the next batch, repeating the current letter heading
                firstChar = prevChar
                subText = "===Articles <range>===\n" + \
                          "====" + prevChar + "====\n"
        if (splitting and not firstBatch):
            mainText += "*[[/Page" + str(pageNo) + "|" + \
                        firstChar + " through " + prevChar + "]]\n"
            subText = subText.replace("<range>",
                                      firstChar + " through " + prevChar)
            self.writeProjPage(self.articleOut + "/Page" + str(pageNo), subText)
            pageNo = pageNo+1
        else:
            mainText += subText

        # the non-article namespaces share one subpage when splitting
        if (splitting):
            subText = self._trackingBlurb(
                "This article contains links to templates, categories, " +
                "portals, and images ", tagText,
                output + "/Page" + str(pageNo))
        else:
            subText = ""
        for anchor, names in (("Wikipedia", self.wikis),
                              ("Templates", self.templates),
                              ("Categories", self.categories),
                              ("Portals", self.portals),
                              ("Images", self.images)):
            heading = "===" + anchor + " (count: " + str(len(names)) + ")===\n"
            mainText, subText = self._appendSection(
                mainText, subText, heading, anchor, names, splitting, pageNo)
        if (splitting):
            self.writeProjPage(self.articleOut + "/Page" + str(pageNo), subText)
            pageNo = pageNo+1

        mainText += "==Talk pages==\n"
        mainText += "===Articles===\n"
        firstChar = "Z"
        if (splitting):
            subText = self._trackingBlurb(
                "This article contains links to some talk pages ", tagText,
                output + "/Page" + str(pageNo)) + \
                "===Articles <range>===\n"
        else:
            subText = ""
        count = 0
        for s in self.articlesTalk:
            if (count == 0):
                # first letter of the title after the "Talk:" prefix
                firstChar = s.split(":")[1][0]
            subText += "*[[" + s + "]]\n"
            count = count+1
            if (count > MAX):
                count = 0
                endChar = s.split(":")[1][0]
                mainText += "*[[/Page" + str(pageNo) + "|" + \
                            firstChar + "-" + endChar + "]]\n"
                subText = subText.replace("<range>", firstChar +
                                          " through " + endChar)
                self.writeProjPage(self.articleOut + "/Page" + str(pageNo),
                                   subText)
                pageNo = pageNo+1
                firstChar = endChar
                subText = "===Articles <range>===\n"
        if (splitting):
            if (self.articlesTalk):
                endChar = self.articlesTalk[-1].split(":")[1][0]
            else:
                # no talk pages at all: avoid reading a stale loop variable
                endChar = firstChar
            mainText += "*[[/Page" + str(pageNo) + "|" + \
                        firstChar + " through " + endChar + "]]\n"
            subText = subText.replace("<range>",
                                      firstChar + " through " + endChar)
            self.writeProjPage(self.articleOut + "/Page" + str(pageNo), subText)
            pageNo = pageNo+1
        else:
            mainText += subText

        if (splitting):
            subText = self._trackingBlurb(
                "This article contains links to some talk pages ", tagText,
                output + "/Page" + str(pageNo))
        else:
            subText = ""
        for anchor, names in (("Wikipedia", self.wikisTalk),
                              ("Templates", self.templatesTalk),
                              ("Categories", self.categoriesTalk),
                              ("Portals", self.portalsTalk),
                              ("Images", self.imagesTalk)):
            mainText, subText = self._appendSection(
                mainText, subText, "===" + anchor + "===\n", anchor, names,
                splitting, pageNo)
        if (splitting):
            self.writeProjPage(self.articleOut + "/Page" + str(pageNo), subText)
            pageNo = pageNo+1

        self.writeProjPage(self.articleOut, mainText)

    def writeProjPage (self, pageName, text):
        """
        write text to the subpage pageName of this project's page
        """
        pageName = "Wikipedia:WikiProject " + self.project + "/" + pageName
        comment = "full update by [[User:WatchlistBot|WatchlistBot]]"
        page = wikipedia.Page(wikipedia.getSite(), pageName)
        writePage(page, text, comment)


def getExcludedArticles (project):
    """
    get the list of pages which should not be tagged even though they're in
    tagged categories
    the list is read from User:WatchlistBot/<project>; returns the talk page
    name for every link found there (empty list if the page does not exist)
    """
    page = wikipedia.Page(wikipedia.getSite(), "User:WatchlistBot/" + project)
    if (not page.exists()):
        return []
    text = page.get()
    # find the "----" the list of articles is below the line
    start = text.find("----\n")
    if (start != -1):
        text = text[start+4:]
    pages = []
    # the chunk before the first "[[" cannot hold a link, so skip it
    for chunk in text.split("[[")[1:]:
        end = chunk.find("]]")
        if (end != -1):
            # drop the label of a piped link ("[[Title|label]]")
            title = chunk[:end].split("|", 1)[0]
            pages.append(getTalkVersion(title))
    return pages


def getTalkVersion (name):
    """
    given a page name, convert it to the associated talk page
    """
    # split on the first colon only so the rest of the title is kept intact
    result = name.split(":", 1)
    if (len(result) == 1):
        return "Talk:"+name
    if (result[0].find("Talk") != -1 or result[0].find("talk") != -1):
        # already a talk page
        return name
    return result[0] + " talk:" + result[1]


def writePage (page, text, comment):
    """
    save text to the wiki page, or, when DBG is set, to a local file named
    after the last part of the page title
    """
    if (not DBG):
        page.put(text, comment, minorEdit=False)
        return
    pageName = page.title()
    # keep only what follows the first "/", then what follows the first ":"
    start = pageName.find("/")
    if (start != -1):
        pageName = pageName[start+1:]
    start = pageName.find(":")
    if (start != -1):
        pageName = pageName[start+1:]
    wikipedia.output(u"Writing file " + pageName + u".txt")
    f = codecs.open(pageName + ".txt", mode="w", encoding="utf8")
    try:
        f.write(text)
    finally:
        # close the file even if the write fails
        f.close()


def untagPage (pageName, tag, params = ""):
    """
    remove the tag from the given talk page, if it is there
    params is an optional list of parameters for the tag
    """
    page = wikipedia.Page(wikipedia.getSite(), pageName)
    if (not page.exists() or page.isRedirectPage()):
        return
    text = page.get()
    marker = "{{" + tag + params + "}}"
    if (text.find(marker) == -1):
        wikipedia.output("Page " + page.title() + " not tagged")
    else:
        # NOTE(review): if replaceExceptMathNowikiAndComments treats its
        # second argument as a regular expression, a "|" in params would need
        # escaping -- confirm against the framework
        text = wikipedia.replaceExceptMathNowikiAndComments(text, marker, "")
        writePage(page, text, "Removing " + tag)


def tagPage (pageName, tag, params = ""):
    """
    tag the given talk page with the tag
    params is an optional list of parameters for the tag (like class=Stub)
    """
    # get the talk page
    page = wikipedia.Page(wikipedia.getSite(), pageName)
    if (not page.exists()):
        # we don't mind if the page doesn't exist yet, just create it
        tagIt(page, "", tag+params)
    elif (page.isRedirectPage()):
        wikipedia.output("Page " + page.title() + " is a redirect")
    else:
        tagIt(page, page.get(), tag+params)


def tagIt (page, text, tag):
    """
    put the tag at the top of text and save the page
    """
    text = "{{" + tag + "}}\n\n" + text
    writePage(page, text, "Adding " + tag)


def findArticlesInCategory (catName, articles, confirm = False):
    """
    find all the articles in the given category, and return a list
    If confirm is true, check each article with the user
    articles is the list so far; the talk page names found are added to it
    in place, duplicates are removed from it, and it is also returned
    """
    # get the category (don't include it, since tagging articles and
    # categories is handled separately)
    cat = catlib.Category(wikipedia.getSite(), catName)

    # get all pages in this category
    for page in cat.articles():
        # if confirming, check
        if (confirm):
            response = wikipedia.input(u"Do you want to tag " + page.title() +
                                       u"? (y for yes)")
            if (response != "y"):
                continue
        ns = page.namespace()
        # add the appropriate prefix
        if (ns == 10): # template
            articles.append("Template talk:" + page.titleWithoutNamespace())
        elif (ns == 0): # article
            articles.append("Talk:" + page.title())
        elif (ns == 6): # image
            articles.append("Image talk:" + page.titleWithoutNamespace())
        elif (ns == 100): # portal
            articles.append("Portal talk:" + page.titleWithoutNamespace())
        elif (ns == 4): # wikipedia
            articles.append("Wikipedia talk:" + page.titleWithoutNamespace())
        elif (ns in (1, 5, 7, 11, 101)):
            # article/wikipedia/image/template/portal talk: already a talk page
            articles.append(page.title())
        elif (ns in (2, 3, 15)):
            # user, user talk, category talk: ignore these
            pass
        else:
            print("Unexpected namespace on " + page.title() + ": " + str(ns))

    # remove duplicates in place (keeping the order), so that the caller's
    # list is really changed
    seen = set()
    unique = []
    for name in articles:
        if (name not in seen):
            seen.add(name)
            unique.append(name)
    articles[:] = unique
    return articles


def updateCategoryList (catList, catName, taggedCats, keywords,
                        questionText = u"Do you want to tag ", confirm = True):
    """
    if catList starts with "", it means we're trying to quit, so just return
    starting at catName, make a list, catList, of all subcategories
    ask the user first, and allow the user the choice to recurse through
    subcategories
    taggedCats is the list of categories that are already tagged and can thus
    be skipped
    keywords are words that if they're in the category, it will be tagged
    without confirmation
    if confirm is false, no confirmation question will be asked (all will be
    included)
    """
    # check if we're quitting
    if (catList and catList[0] == ""):
        return catList

    cat = catlib.Category(wikipedia.getSite(), "Category:" + catName)

    response = "n"
    if (catName not in catList):
        # if the categories is already in the taggedCats, treat that like a
        # "y" from the user
        if ("Category:"+catName in taggedCats):
            response = "y"
        # if the name has a keyword in it, treat that like a "y" from the user
        for keyword in keywords:
            if (keyword in catName):
                response = "y"
        # if we haven't found a keyword, ask the user (but if confirm is
        # False, treat it as if the user already said yes)
        if (confirm == False):
            response = "y"
        if (response == "n"):
            response = wikipedia.input(questionText + cat.title() +
                u"? (y for yes, yn for yes but no recursion, s for stop recursion)")
        if (response == "s"):
            # put "" into the catlist at the beginning as a marker
            catList.insert(0, "")
            return catList
        # add the category to the list
        if (response == "y" or response == "yn"):
            catList.append(cat.titleWithoutNamespace())
        # recurse through subcategories
        if (response == "y"):
            for subcat in cat.subcategories():
                updateCategoryList(catList, subcat.titleWithoutNamespace(),
                                   taggedCats, keywords, questionText, confirm)
    return catList


def tagCategories (catName = "Baseball", tag = "Baseball-WikiProject",
                   params = "|class=NA",
                   keywords = ["Baseball", "baseball", "field personnel",
                               "players", "managers", "coaches",
                               "World Series"]):
    """
    tag all categories in the specified category and subcategories with the
    specified tag (at the top of the page)
    check with the user for each category
    keywords are words that if they're in the category, it will be tagged
    without confirmation
    """
    wikipedia.put_throttle.setDelay(10, absolute = True)

    # get the list of categories which are already tagged
    taggedCatList = []
    taggedArticleList = []
    getTagged(tag, taggedCatList, taggedArticleList)

    # get the category list
    catList = updateCategoryList([], catName, taggedCatList, keywords)
    # if the first element of catList is "", remove it, it was just a marker
    # (it is only there when the user asked to stop)
    if ("" in catList):
        catList.remove("")

    # remove duplicates and sort, then tag the categories which are not
    # already tagged
    for cat in sorted(set(catList)):
        if ("Category:"+cat not in taggedCatList):
            tagPage("Category talk:" + cat, tag, params)


def getTagged (tag, catList, articles):
    """
    get a list of categories and articles which contain the specified tag
    category talk pages are added to catList as "Category:<name>", every
    other page title is added to articles
    """
    page = wikipedia.Page(wikipedia.getSite(), "Template:" + tag)
    for ref in page.getReferences(onlyTemplateInclusion=True):
        name = ref.title()
        # split on the first colon only so the category name stays complete
        result = name.split(":", 1)
        if (len(result) == 2 and result[0] == "Category talk"):
            catList.append("Category:"+result[1])
        else:
            articles.append(name)


def untag (catList = [], tag = "Numismaticnotice"):
    """
    remove the tag from all articles in the specified categories
    this is useful when the bot makes a mistake
    """
    articles = []
    for catName in catList:
        findArticlesInCategory("Category:"+catName, articles, False)
    for article in sorted(set(articles)):
        untagPage(article, tag)
    wikipedia.stopme()


def addNA (catName="Unassessed numismatic articles", tag="Numismaticnotice"):
    """
    add "|class=NA" to all articles in the specified category which are not
    in the article namespace (categories, images, etc)
    """
    articles = []
    findArticlesInCategory("Category:"+catName, articles, False)

    # keep only the pages which are not in the main article namespace
    articlesToTag = [article for article in articles
                     if article.find("Talk:") == -1]
    addParams(articlesToTag, "|class=NA", tag,
              "Numismatics assessment, adding class=NA")


def addParamsToIntersection (firstCat = "Unassessed Louisville articles",
                             secondCat = "Louisville stubs",
                             recurse = True,
                             params = "|class=Stub",
                             tag = "WikiProject Louisville",
                             comment = "Louisville assessment, adding class=Stub"):
    """
    find the articles in the intersection of firstCat and secondCat and add
    params to their tag
    (this function used to be a second definition named addParams, which was
    silently replaced by the list-based addParams below and so could never
    be called)
    all subcats of secondCat (but not firstCat) are included; recurse is
    accepted but currently has no effect
    params is the text to add to the template
    tag is the name of the template tag
    comment is the text to use for the comment when saving
    """
    # get the list of articles in the first category
    firstArticles = []
    findArticlesInCategory("Category:"+firstCat, firstArticles, False)
    print(str(len(firstArticles)))

    # get the list of articles in the second category
    secondCatList = updateCategoryList([], secondCat, [], [],
                                       "Do you want to include ", False)
    secondArticles = []
    for cat in secondCatList:
        findArticlesInCategory("Category:"+cat, secondArticles, False)
    print(str(len(secondArticles)))

    # get the list of articles that is in both
    inSecond = set(secondArticles)
    articles = [article for article in firstArticles if article in inSecond]
    print(str(len(articles)))

    addParams(articles, params, tag, comment)


def addParams (articles, params, tag, comment):
    """
    articles is the list of articles to change
    params is the text to add to the template
    tag is the name of the template tag
    comment is the text to use for the comment when saving
    """
    for article in articles:
        page = wikipedia.Page(wikipedia.getSite(), article)
        text = page.get()
        text = wikipedia.replaceExceptMathNowikiAndComments(
            text, tag, tag+params)
        writePage(page, text, comment)


def replaceTag (oldTag="LouisvilleWikiProject", newTag="WikiProject Louisville"):
    """
    replace the oldTag with the newTag (can be used to replace a tag with a
    tag plus parameters)
    """
    articles = []
    getTagged(oldTag, [], articles)
    for article in articles:
        page = wikipedia.Page(wikipedia.getSite(), article)
        text = page.get()
        text = wikipedia.replaceExceptMathNowikiAndComments(
            text, oldTag, newTag)
        writePage(page, text, "replacing " + oldTag + " with " + newTag)


def tag (tag = "WikiProject Hawaii", params = "", otherTag = "",
         project = "Hawaii", confirm=False, catList = None):
    """
    tag articles in tagged categories
    if a page is already tagged with otherTag, skip it (use otherTag = "" for
    none)
    catList is a list of categories to check in. If empty, use tagged
    categories
    if params is given, include it after the tag, when tagging an article
    """
    # get the list of all tagged articles in taggedArticles
    # if catList was given, use it. Otherwise, use all tagged categories
    taggedArticles = []
    if (not catList):
        # build a fresh list: appending to a shared default list made a
        # second call prefix the names with "Category:" twice
        catList = []
        getTagged(tag, catList, taggedArticles)
    else:
        getTagged(tag, [], taggedArticles)
        # put "Category:" in front of the category names
        catList = ["Category:"+cat for cat in catList]

    # add the articles tagged with otherTag to the list of taggedArticles
    if (otherTag != ""):
        getTagged(otherTag, [], taggedArticles)

    # get the list of untagged articles in the categories in catList (which
    # was either supplied as a parameter, or was populated with tagged
    # categories)
    untaggedArticles = []
    for cat in catList:
        findArticlesInCategory(cat, untaggedArticles, confirm)

    # make a sorted, duplicate-free list of articles that need to be tagged
    # (by removing articles that are already tagged from list of all articles)
    alreadyTagged = set(taggedArticles)
    untaggedArticles = [article for article in sorted(set(untaggedArticles))
                        if article not in alreadyTagged]

    # remove excluded articles
    excluded = getExcludedArticles(project)
    print(excluded)
    excludedSet = set(excluded)
    untaggedArticles = [article for article in untaggedArticles
                        if article not in excludedSet]

    if (len(untaggedArticles) == 0):
        wikipedia.output(u"No untagged articles")
    print("Tagging " + str(len(untaggedArticles)) + " articles")

    # tag the articles
    for article in untaggedArticles:
        tagPage(article, tag, params)

    wikipedia.stopme()


# the active projects; update() keeps its per-project settings in lists that
# run parallel to this one
projects = ["Numismatics", "Numismatics", "Hawaii", "Texas", "Ice Hockey",
            "Louisville", "Kentucky", "Texas State Highways", "Dallas",
            "Comics", "Pittsburgh", "Baseball", "Automobiles", "Cricket"]


def listProjects ():
    """
    print out a list of active projects, with numbers to use for an
    individual update
    """
    for proj in range(len(projects)):
        print(str(proj) + ": " + projects[proj])


def update (projectNums = []):
    """
    update the project watchlists. If projectNums is given, only update the
    given project numbers (see projects for list, remember to start at 0)
    """
    templates = ["Numismaticnotice", "Exonumianotice", "WikiProject Hawaii",
                 "WikiProject Texas", "Ice hockey", "WikiProject Louisville",
                 "WikiProject Kentucky", "Texas State Highway WikiProject",
                 "WikiProject Dallas", "comicsproj", "PittsburghWikiProject",
                 "Baseball-WikiProject", "AutomobileWatch", "CricketWatch"]
    articleOuts = ["Articles", "Exonumia articles", "Hawaii recent changes",
                   "Articles", "Articles", "Watchall", "Watchall", "Watchlist",
                   "Articles", "Articles", "Articles", "Articles", "Articles",
                   "Articles"]
    # pages to include even though they aren't tagged
    includePagesLists = [["Template:AfricanCurrencies", "Template:AmericanCurrencies",
                          "Template:AsianCurrencies", "Template:EuropeanCurrencies"],
                         [], [], [], [], [], [], [], [], [], [], [], [], []]
    # true if we're getting tagged articles, false if we're getting articles
    # in tagged categories
    taggedPagesFlags = [True, True, True, True, True, True, True, True, True,
                        True, True, True, False, False]

    if (len(projectNums) == 0):
        projectNums = range(len(templates))

    for i in projectNums:
        template, project = templates[i], projects[i]
        articleOut, includePagesList = articleOuts[i], includePagesLists[i]
        taggedPagesFlag = taggedPagesFlags[i]
        print("Updating watchlist for: %s using template: %s. Saving to: %s"
              % (project, template, articleOut))
        wl = Watchlist(template, project, articleOut, includePagesList)
        if (taggedPagesFlag):
            wl.getTaggedPages()
        else:
            wl.getPagesFromTaggedCategories()
        wl.writeList(taggedPagesFlag)

    wikipedia.stopme()