User:Disambot/Source

The Disambot source code is divided into three scripts:

enwp.py provides the framework for interfacing with the English Wikipedia. It uses a combination of API calls and regular HTTP requests.
disambot.py reads a list of disambiguation page titles from the file "working list" and passes each title to an inspection function, which loads the page content, makes various changes, and saves any changes.
private.py stores the username and password of the bot account.

These scripts are shown below:

enwp.py[edit]

 import urllib, urllib2, ClientCookie, time
 
 
 # When True, the functions in this module print progress messages to stdout.
 debug_mode = False
 # Root URL of the wiki; every request URL below is built from it.
 base_url = 'http://en.wikipedia.org/'
 # Endpoint used for API calls (logging in, fetching edit tokens).
 api_url = base_url + 'w/api.php'
 
 
 def login(username, password):
 	"""Log in to the wiki through its API.
 	
 	username -- name of the account to log in as
 	password -- password of that account
 	
 	The request is sent with ClientCookie.urlopen; presumably ClientCookie
 	keeps the session cookies for the later requests (not verifiable from
 	this module alone).
 	
 	Returns True when the API response reports result="Success", False
 	otherwise, so callers can detect a failed login.
 	"""
 	data = {
 		'action'     : 'login',
 		'lgname'     : username,
 		'lgpassword' : password,
 		'format'     : 'xml'
 	}
 	
 	if debug_mode: print('Logging in...')
 	response = ClientCookie.urlopen(api_url, urllib.urlencode(data)).read()
 	if debug_mode: print('Done')
 	
 	# The XML reply carries the outcome in a result="..." attribute
 	return 'result="Success"' in response
 
 
 def grab_page(title, render=False, expand_templates=False):
 	"""Fetch a page through index.php and return the response body as a string.
 	
 	title            -- page title; spaces are replaced by underscores
 	render           -- if true request action=render, otherwise action=raw
 	expand_templates -- if true append templates=expand to the request
 	
 	The title is percent-encoded before it goes into the URL, so characters
 	such as '&', '?', '#', '+' or '%' in a title cannot corrupt the query
 	string. '/' and ':' are left as they are to keep namespace and subpage
 	titles readable.
 	"""
 	if render: action = 'render'
 	else:      action = 'raw'
 	
 	url = base_url + 'w/index.php?title=' + urllib.quote(title.replace(' ', '_'), '/:') + '&action=' + action
 	if expand_templates: url += '&templates=expand'
 	if debug_mode: print('Fetching ' + url)
 	
 	response = ClientCookie.urlopen(url).read()
 	if debug_mode: print(str(len(response)) + ' bytes received')
 	
 	return response
 	
 
 def edit_page(title, new_content, summary=''):
 	# First, obtain the required editing token and the timestamp of the last page edit
 	url = globals()['api_url']
 	data = {
 		'action'  : 'query',
 		'prop'    : 'info|revisions',
 		'intoken' : 'edit',
 		'titles'  : title,
 		'format'  : 'xml'
 	}
 	if globals()['debug_mode']: print 'Fetching ' + url
 	response = ClientCookie.urlopen(url, urllib.urlencode(data)).read()
 	if globals()['debug_mode']: print str(len(response)) + ' bytes received'
 	
 	# Grab the supplied token from the XML-formatted response
 	token_start = response.find('edittoken="') + len('edittoken="')
 	token_end   = response.find('"', token_start)
 	token = response[token_start : token_end]
 	if globals()['debug_mode']: print 'Token: ' + token
 	
 	# Grab the last revision timestamp as well
 	ts_start = response.find('timestamp="') + len('edittoken="')
 	ts_end   = response.find('"', ts_start)
 	ts = response[ts_start : ts_end]
 	if globals()['debug_mode']: print 'Base timestamp: ' + ts
 	
 	# We just fetched a (last edit) timestamp of the form 2008-06-18T07:18:06Z; convert it to 20080618071806
 	edit_time = ts[0:4] + ts[5:7] + ts[8:10] + ts[11:13] + ts[14:16] + ts[17:19]
 	if globals()['debug_mode']: print 'Time of last edit: ' + str(edit_time)
 	
 	# Get the current time and convert it to the 20080618071806 format as well
 	ct = time.gmtime()[0:6] # tuple of the form (year, month, day, hour, minute, second)
 	start_time = str(ct[0]).zfill(4) + str(ct[1]).zfill(2) + str(ct[2]).zfill(2) + str(ct[3]).zfill(2) + str(ct[4]).zfill(2) + str(ct[5]).zfill(2)
 	if globals()['debug_mode']: print 'Time of token retreival: ' + str(start_time)
 	
 	# Next, use the API to push the new page content
 	'''
 	data = {
 		'action'        : 'edit',
 		'title'         : title,
 		'section'       : 0,
 		'text'          : new_content,
 		'token'         : token,
 		'summary'       : summary,
 		'bot'           : True,
 		'basetimestamp' : ts,
 		'nocreate'      : True,
 		'format'        : 'xml'
 	}
 	'''
 	url = globals()['base_url'] + 'w/index.php?' + urllib.urlencode({ 'title':title, 'action':'submit' }, True)
 	data = {
 		'wpAntispam'    : '',
 		'wpSection'     : '',
 		'wpStarttime'   : start_time,
 		'wpEdittime'    : edit_time,
 		'wpScrolltop'   : 0, # WTF does this do?
 		'wpTextbox1'    : new_content,
 		'wpSummary'     : summary,
 		'wpAutoSummary' : 'd41d8cd98f00b204e9800998ecf8427e', # not sure how this works
 		'wpSave'        : 'Save page',
 		'wpEditToken'   : token
 	}
 	data = urllib.urlencode(data)
 	req = urllib2.Request(url, data, { 'User-Agent' : 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9) Gecko/2008060309 Firefox/3.0' }, True)
 	
 	if globals()['debug_mode']: print 'Sending data to ' + url
 	try:
 		response = ClientCookie.urlopen(req).read()
 	except urllib2.HTTPError, response:
 		if globals()['debug_mode']: print 'HTTP error encountered...'
 	except AttributeError: pass # seems to be a small of bug in ClientCookie
 	if globals()['debug_mode']: globals()['response'] = response
 	
 	'''
 	result_start = response.find('result="') + len('result="')
 	result_end   = response.find('"', result_start)
 	result = response[result_start : result_end]
 	if globals()['debug_mode']: print 'Result: ' + result
 	
 	if result.lower() is 'failure':
 		return False
 	'''
 	
 	return True
 
 def sandbox_test():
 	"""Manual smoke test: overwrite Wikipedia:Sandbox with a short greeting."""
 	target = 'Wikipedia:Sandbox'
 	greeting = 'Hello! This is a sandbox edit done using a [[Python (programming language)|Python]] script.'
 	edit_page(target, greeting)

disambot.py[edit]

 import enwp, private
 
 # Abbreviations whose final period is part of the word; inspect() consults
 # this list before removing punctuation from the end of a list item.
 abbreviations = ( 'ac.', 'Co.', 'Corp.', 'deg.', 'ft.', 'Inc.', 'kg.', 'km.', 'mi.', 'mo.', 'oz.', 'qr.', 'qt.', 'yd.' )
 
 # Log in to en-wp account as soon as this module is loaded, using the
 # credentials stored in private.py
 enwp.login(private.username, private.password)
 
 
 def inspect(title):
 	"""Clean up one disambiguation page and record problems that need human review.
 	
 	title -- title of the page to inspect
 	
 	The page's wikitext is fetched and every list item is normalised:
 	'#' bullets become '*', trailing punctuation is removed (unless it ends a
 	known abbreviation or follows a capital letter), bold markup around a
 	leading link is removed, the display text of a leading piped link is
 	dropped, and links after the first one are unlinked. The page is saved
 	only when at least one line actually changed. Problems that are not
 	fixed automatically are appended to User:Disambot/Potential atrocities,
 	unless the page is already listed there.
 	
 	Returns False for set index pages, which are skipped; otherwise None.
 	"""
 	print('Inspecting ' + title + '...')
 	
 	changed = False
 	complex_errors = []
 	
 	article_body = enwp.grab_page(title).strip()
 	
 	# Skip set indices
 	if '[[category:set indices' in article_body.lower():
 		return False
 	
 	# Rendered HTML, used to recognise red links
 	raw_html = enwp.grab_page(title, True)
 	
 	lines = article_body.splitlines()
 	
 	# Main loop -- cycle through lines
 	for i, raw_line in enumerate(lines):
 		# Skip short/empty lines
 		if len(raw_line) < 5:
 			continue
 		
 		# Strip extra whitespace; a line made only of whitespace has nothing to fix
 		line = raw_line.strip()
 		if not line:
 			continue
 		line_orig = line
 		
 		# Replace ordered list items with unordered list items
 		if line[0] == '#':
 			line = '*' + line[1:]
 		
 		# Handle list items
 		if line[0] == '*':
 			line = _clean_list_item(line, i, raw_html, complex_errors)
 		
 		# Replace old version of this line with new one if we've changed anything
 		if line != line_orig:
 			lines[i] = line
 			changed = True
 	
 	# Implode lines back into one big string
 	article_body = "\n".join(lines)
 	
 	# Check for external links
 	links = article_body.count('[http')
 	if links > 0:
 		complex_errors.append('contains '+str(links)+' external link'+('s'*(links!=1)))
 	
 	# Finish up
 	if changed:
 		# Update the article
 		print("\tMaking changes...")
 		enwp.edit_page(title, article_body, 'Cleaning up disambiguation page in accordance with [[Wikipedia:Manual of Style (disambiguation pages)]]')
 	if complex_errors:
 		# Add the article to list of potential atrocities, along with notes, unless it's already there
 		atrocities = enwp.grab_page('User:Disambot/Potential atrocities')
 		if atrocities.find("[[" + title + "]]") == -1: # if not already listed
 			atrocities += "\n\n[[" + title + "]]"
 			atrocities += ''.join(["\n* " + this for this in complex_errors])
 			print("\tListing on potential atrocities...")
 			enwp.edit_page('User:Disambot/Potential atrocities', atrocities, 'Adding [['+title+']]')
 
 
 def _clean_list_item(line, line_no, raw_html, errors):
 	"""Return the cleaned-up version of one '*' list item; problems found are appended to errors."""
 	line = _strip_trailing_punctuation(line, line_no, errors)
 	
 	# Split the item into its bullets (plus following whitespace) and its text
 	content = line.lstrip('* \t')
 	if not content:
 		return line # nothing but bullets
 	prefix = line[:len(line) - len(content)]
 	
 	content = _unbold_leading_link(content)
 	content = _unpipe_leading_link(content)
 	content = _unlink_extra_links(content, line_no, raw_html, errors)
 	
 	# Put the untouched bullets back in front of the cleaned text
 	return prefix + content
 
 
 def _strip_trailing_punctuation(line, line_no, errors):
 	"""Remove a final '.', ',' or ';' unless it ends an abbreviation or follows a capital letter."""
 	if line[-1] not in '.,;':
 		return line
 	
 	# Two or more periods where the first is followed by a space and a
 	# character that is not lower case: treated as several sentences
 	first_dot = line.find('.')
 	if line.count('.') >= 2 and line[first_dot+1] == ' ' and line[first_dot+2] == line[first_dot+2].upper():
 		errors.append('item with multiple sentences detected (line '+str(line_no)+')')
 		return line
 	
 	# Keep the punctuation if the line ends with a proper abbreviation
 	lowered = line.lower()
 	for a in abbreviations:
 		if lowered.endswith(' ' + a.lower()):
 			return line
 	
 	# Keep it after a capital letter as well (looks like an acronym)
 	if len(line) > 1 and line[-2] == line[-2].lower():
 		return line[:-1].rstrip()
 	return line
 
 
 def _unbold_leading_link(content):
 	"""Remove bold markup wrapped around a link at the very start of an item."""
 	if not content.startswith("'''"):
 		return content
 	unquoted = content.lstrip("'")
 	quotes = len(content) - len(unquoted)
 	# Only plain bold (exactly three quotes) directly around a link is handled,
 	# and only when the closing markup is present; anything else is left alone
 	if quotes != 3 or not unquoted.startswith('[[') or "'''" not in unquoted:
 		return content
 	return unquoted.replace("'''", '', 1)
 
 
 def _unpipe_leading_link(content):
 	"""Drop the display text of a piped link that opens the item: [[A|b]] becomes [[A]]."""
 	if not content.startswith('[['):
 		return content
 	pipe = content.find('|')
 	close = content.find(']]')
 	if pipe == -1 or close == -1 or pipe > close:
 		return content
 	italics = content.find("''", pipe, close)
 	if italics != -1 and content[italics+2] != "'":
 		# Italic display text is not handled yet; leave the link unchanged
 		return content
 	return content[:pipe] + content[close:]
 
 
 def _unlink_extra_links(content, line_no, raw_html, errors):
 	"""Unlink every wikilink after the first, provided the first one leads the item and is not a red link."""
 	if content.find('[[', 3) == -1:
 		return content # at most one link, and it is at the start
 	
 	misplaced = 'item contains link, but not in the proper place (line '+str(line_no)+')'
 	malformed = 'error in wikilink syntax (line '+str(line_no)+')'
 	
 	first_open = content.find('[[')
 	if first_open != 0 and first_open != 2:
 		# Removing the other links would leave the item without a usable link. Human review needed.
 		errors.append(misplaced)
 		return content
 	
 	first_close = content.find(']]', first_open)
 	if first_close == -1:
 		# No closing "]]" ... something must be screwy
 		errors.append(malformed)
 		return content
 	
 	# The link target is what precedes a pipe inside the first link, if any
 	first_pipe = content.find('|', first_open, first_close)
 	if first_pipe == -1:
 		target = content[first_open+2:first_close]
 	else:
 		target = content[first_open+2:first_pipe]
 	if raw_html.find(target+' (page does not exist)') != -1:
 		# The first link is a red link, so the remaining links must stay. Human review needed.
 		errors.append(misplaced)
 		return content
 	
 	# Replace each later link by its visible text
 	search_from = first_close + 2
 	while True:
 		link_start = content.find('[[', search_from)
 		link_end   = content.find(']]', search_from)
 		if link_start == -1 or link_end == -1:
 			break # no links remain
 		if link_start > link_end:
 			errors.append(malformed)
 			break
 		# Only a pipe inside this very link separates target from display text
 		link_pipe = content.find('|', link_start, link_end)
 		if link_pipe == -1:
 			text_start = link_start + 2
 		else:
 			text_start = link_pipe + 1
 		content = content[:link_start] + content[text_start:link_end] + content[link_end+2:]
 	return content
 
 
 def go(list_path='working list'):
 	"""Run inspect() on every page title listed in a text file, one title per line.
 	
 	list_path -- path of the list file; defaults to 'working list'
 	
 	Blank lines are skipped, and the file is closed even if an inspection
 	raises an exception.
 	"""
 	article_list = open(list_path, 'r')
 	try:
 		for title in article_list:
 			title = title.strip()
 			if title:
 				inspect(title)
 	finally:
 		article_list.close()

private.py[edit]

 # Credentials of the bot account; disambot.py passes them to enwp.login().
 # The real values are intentionally not shown on this page.
 username = '(not shown)'
 password = '(not shown)'