Jump to content

Module:Internet Archive

Permanently protected module
From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by GreenC (talk | contribs) at 19:44, 6 October 2014 (Created page with '--[[ This module is for functions related to Internet Archive. Function author, invoked from Template:Internet Archive author {{Internet Archiv...'). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
(diff) ← Previous revision | Latest revision (diff) | Newer revision → (diff)

--[[ 

    This module is for functions related to Internet Archive.

    Function author, invoked from Template:Internet Archive author

      {{Internet Archive author|name=|birth=|death=}}

      Returns an optimized Internet Archive search link for use in the External links section, e.g.:
           
                ==External links==
                * {{Internet Archive author|name=Albert Einstein|birth=1879|death=1955}}

      . All arguments are optional. 
          name  = Defaults to article title.
          birth = Defaults to #### in %[%[Category:#### births%]%]
          death = Defaults to #### in %[%[Category:#### deaths%]%]
 
      . Uses Protocol Relative URLs. See WP:PRURL.

      . The script produces an optimized Internet Archive search string. By way of background, Internet Archive metadata is inconsistent. Books are cataloged by thousands of independent entities (libraries, persons, etc.). Each will choose to use some metadata fields (of which IA has dozens) and not others, and they will fill out the fields differently; for example, there are many ways to enter a 3-word name (John H. Smith, J.H. Smith, etc.). Just entering a name into the IA search box will usually miss many books; I have seen the optimized search string find up to 3X as many books. The search strategy is based on the number of words in a name so that various combination possibilities can be built. The IA fields "subject" and "creator" are the most common and will usually find the majority of books. However, "title" and "description" are also useful. Complicating matters, IA limits how long a search string can be, so the strategy below breaks down for names longer than 3 words: 4+ word searches are by necessity relatively basic compared to a 3-word search.

      . Project Gutenberg is mirrored on Internet Archive. Since Wikipedia already uses {{Gutenberg author}}, those books are removed from the Internet Archive search results (-contributor%3Agutenberg).

      . LibriVox is mirrored there also. For the same reason, audio results are not included. Open question: there are other audio/video works on IA besides LibriVox; should those be included? Consider the quality of the works, cluttered results, readers' expectations for books, and other templates.

]]

-- Table of functions exported by this module; it is returned at the end of the file.
local p = {}

--- Literal pieces shared by every generated search link.
local IA_LINK_PREFIX = "[//archive.org/search.php?query=mediatype%3A(texts)%20-contributor%3Agutenberg%20AND%20("
local IA_OR = "%20OR%20"

--- Build one quoted search term, i.e. the percent-encoded form of field:"value".
-- `value` must already be percent-encoded.
local function ia_term(field, value)
  return field .. "%3A%22" .. value .. "%22"
end

--- Remove HTML comments and <nowiki> sections so that categories mentioned
-- inside them are not mistaken for real ones.
-- The hyphens are escaped on purpose: an unescaped "-" is a lazy quantifier
-- in Lua patterns, which made the old pattern also match text such as "<-...>".
local function strip_false_positives(text)
  text = mw.ustring.gsub(text, "<!%-%-.-%-%->", "")
  text = mw.ustring.gsub(text, "<nowiki>.-</nowiki>", "")
  return text
end

--- Find the number in a "Category:#### births" / "Category:#### deaths" link.
-- @param pagetext wikitext to search
-- @param kind     "births" or "deaths"
-- @return the number as a string, or "none" when no such category is present
local function category_year(pagetext, kind)
  local category = mw.ustring.match(pagetext, "%[%[%s-[Cc]ategory:%s-%d+%.?%d*%s-" .. kind .. "%s-%]%]")
  if category == nil then
    return "none"
  end
  return mw.ustring.match(category, "%d+%.?%d*")
end

--- Entry point for Template:Internet Archive author.
-- Reads name/birth/death from the parent (template) frame and returns an
-- external-link wikitext string pointing at an Internet Archive search.
-- Missing arguments are filled in from the current page: the name from the
-- page title, the dates from its "#### births" / "#### deaths" categories.
-- @param frame the frame object passed by the #invoke call
-- @return wikitext link; "" when the page text is needed but cannot be read;
--         an "Unknown error (1)" message when no search could be built
function p.author(frame)

  local args = frame:getParent().args
  local title = mw.title.getCurrentTitle()

  --- Determine name
  local name = args.name
  if name == nil or name == "" then
    name = title.text
  end
  name = string.gsub(name, "%s%(.*%)", "") -- remove disambiguation () from title

  --- Determine dob and dod
  -- The page text is fetched at most once, and only if a date is missing.
  local pagetext = nil
  local function year_from_page(kind)
    if pagetext == nil then
      local content = title:getContent()
      if content == nil then
        return nil -- page could not be read
      end
      pagetext = strip_false_positives(content)
    end
    return category_year(pagetext, kind)
  end

  local birth
  if args.birth == nil or args.birth == "" then
    birth = year_from_page("births")
    if birth == nil then
      return ""
    end
  else
    birth = string.gsub(args.birth, " ", "")
  end

  local death
  if args.death == nil or args.death == "" then
    death = year_from_page("deaths")
    if death == nil then
      return ""
    end
  else
    death = string.gsub(args.death, " ", "")
  end

  --- Static status strings
  local byabout = "Works by or about"
  local tagline = "at [[Internet Archive]] (scanned books original editions color illustrated)"

  -- Wrap a finished query in the external-link markup.
  local function make_link(query)
    return IA_LINK_PREFIX .. query .. ") " .. byabout .. " " .. name .. "] " .. tagline
  end

  --- Split name into words and count words
  -- Words and count come from the same scan, so runs of several spaces can
  -- no longer produce empty words or a count that disagrees with the list.
  local words = {}
  for word in string.gmatch(name, "%S+") do
    words[#words + 1] = word
  end
  local count = #words

  -- Simple search: used when either date is unknown, and for names of one
  -- word or of five and more words.
  if birth == "none" or death == "none" or count == 1 or count > 4 then
    local nameurl = url_encode(name)
    return make_link(ia_term("subject", nameurl) .. IA_OR .. ia_term("creator", nameurl) .. "%20")
  end

  -- Optimized search based on number of words.
  -- Every word is percent-encoded so that non-ASCII letters or punctuation
  -- in a name cannot break the generated link.
  local w = {}
  for i, word in ipairs(words) do
    w[i] = url_encode(word)
  end
  local dates = "%2C%20" .. birth .. "-" .. death -- ", birth-death"

  local terms = nil

  if count == 2 then

    local inverted = w[2] .. "%2C%20" .. w[1] -- "Last, First"
    local natural = w[1] .. "%20" .. w[2]     -- "First Last"
    terms = {
      ia_term("subject", inverted .. dates),
      ia_term("creator", inverted .. dates),
      ia_term("creator", natural),
      ia_term("title", natural),
      ia_term("description", natural),
    }

  elseif count == 3 then

    -- This is at its near-maximum length for Internet Archive to handle.
    local first, middle, last = w[1], w[2], w[3]
    -- mw.ustring.sub takes a whole character; string.sub would cut a
    -- multi-byte UTF-8 initial in half.
    local fi = url_encode(mw.ustring.sub(words[1], 1, 1))
    local mi = url_encode(mw.ustring.sub(words[2], 1, 1))

    local inv_full = last .. "%2C%20" .. first .. "%20" .. middle                                               -- "Last, First Middle"
    local inv_init = last .. "%2C%20" .. fi .. "%2E%20" .. mi .. "%2E%20%28" .. first .. "%20" .. middle .. "%29" -- "Last, F. M. (First Middle)"
    local inv_mid = last .. "%2C%20" .. first .. "%20" .. mi .. "%2E%20%28" .. first .. "%20" .. middle .. "%29"  -- "Last, First M. (First Middle)"
    local nat_full = first .. "%20" .. middle .. "%20" .. last                                                  -- "First Middle Last"
    local nat_mid = first .. "%20" .. mi .. "%2E%20" .. last                                                    -- "First M. Last"
    local nat_init = fi .. "%2E%20" .. mi .. "%2E%20" .. last                                                   -- "F. M. Last"

    terms = {
      ia_term("subject", inv_full .. dates),
      ia_term("subject", inv_init .. dates),
      ia_term("subject", inv_mid .. dates),
      ia_term("subject", inv_full),
      ia_term("subject", inv_init),
      ia_term("subject", inv_mid),
      ia_term("subject", nat_full),
      ia_term("subject", nat_mid),
      ia_term("subject", nat_init),
      ia_term("creator", inv_full .. dates),
      ia_term("creator", inv_full .. "%2C%20Sir" .. dates),
      ia_term("creator", inv_init .. dates),
      ia_term("creator", inv_mid .. dates),
      ia_term("creator", inv_full),
      ia_term("creator", inv_init),
      ia_term("creator", inv_mid),
      ia_term("creator", nat_full),
      ia_term("creator", nat_mid),
      ia_term("creator", nat_init),
      ia_term("title", nat_full),
      ia_term("title", nat_mid),
      ia_term("title", nat_init),
      ia_term("description", nat_full),
      ia_term("description", nat_mid),
      ia_term("description", nat_init),
      ia_term("description", inv_full),
      ia_term("description", inv_init),
      ia_term("description", inv_mid),
    }

  elseif count == 4 then

    local inverted = w[4] .. "%2C%20" .. w[1] .. "%20" .. w[2] .. "%20" .. w[3]
    local natural = w[1] .. "%20" .. w[2] .. "%20" .. w[3] .. "%20" .. w[4]
    terms = {
      ia_term("subject", inverted .. dates),
      ia_term("creator", inverted .. dates),
      -- Quoted like every other term; previously the quotes were missing,
      -- so only the first word was tied to the creator field.
      ia_term("creator", natural),
    }

  end

  if terms ~= nil then
    return make_link(table.concat(terms, IA_OR))
  end

  -- Reached only when the name contains no words at all.
  return "Unknown error (1). Please check documentation for [[Template:Internet Archive author]]"

end

--- Percent-encode a string for use in a URL query.
--- Adapted from http://lua-users.org/wiki/StringRecipes
---
--- Line feeds become CR LF, every byte other than ASCII letters, digits,
--- space, "-", "_", "." and "~" becomes %XX, and spaces finally become "+".
--- A nil or false argument is handed back unchanged.
function url_encode(str)
  if not str then
    return str
  end

  -- Turns a single byte into its %XX form.
  local function to_hex(ch)
    return string.format("%%%02X", string.byte(ch))
  end

  local encoded = string.gsub(str, "\n", "\r\n")
  encoded = string.gsub(encoded, "([^%w %-%_%.%~])", to_hex)
  encoded = string.gsub(encoded, " ", "+")
  return encoded
end

--- Split a string into parts.
--- Adapted from http://stackoverflow.com/questions/1426954/split-string-in-lua
---
--- @param s         string to split
--- @param delimiter separator; note that it is used as a Lua pattern, so
---                  magic characters in it must be escaped by the caller
--- @return a new array of the pieces between delimiters (pieces may be
---         empty strings when delimiters are adjacent)
function split(s, delimiter)
    -- Declared local so that each call gets its own table and nothing is
    -- written into the global environment.
    local result = {}
    for match in (s .. delimiter):gmatch("(.-)" .. delimiter) do
        result[#result + 1] = match
    end
    return result
end

-- Hand the export table (holding p.author) back to whatever loaded this module.
return p