# -*- coding: utf-8 -*-
""" --- Corpus Clean - Stage 1
Copyright (c) 2008-2010 Gisle Ytrestol (gisley@ifi.uio.no)
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at
your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
The first stage of this script takes a folder of Wikipedia articles in the
original Wikipedia markup, and strips unwanted markup from each article.
The script is intended to be used together with Tokenizer v1.0,
http://www.cis.uni-muenchen.de/~wastl/misc/.
The output of this script is a single file which should be used as input
for Tokenizer v1.0 . The output from Tokenizer should be used by stage 2,
cccp.py. This second stage inserts proper sentence boundaries and allows
the user to specify whether the output should be one single file for each
corresponding input file, or whether the entire corpus should be dumped
into one single file/corpus.
Run python ccp.py -h for help!
"""
import os, re, regex, urllib,cccp
import sys,string
import codecs
from optparse import OptionParser
class AdjustName:
    """Normalise the trailing slash of a folder name."""

    def addSlash(self, name):
        """Return *name* with exactly one trailing '/' appended if missing.

        An empty name is returned unchanged so that it is not silently
        turned into the filesystem root.
        """
        if not name or name.endswith('/'):
            return name
        return name + '/'

    def removeSlash(self, name):
        """Return *name* without its final '/', if it has one.

        Only a single trailing slash is removed. An empty name is returned
        unchanged.
        """
        if name.endswith('/'):
            return name[:-1]
        return name
class WikiReader:
    """Thin helpers for listing article files and reading their content."""

    def readFile(self, file):
        """Return everything left to read from the open file object."""
        return file.read()

    def listFiles(self, inputFolder):
        """Return the sorted entry names of *inputFolder*, skipping dot-files.

        Names are returned as given by os.listdir (no folder prefix).
        """
        return sorted(
            [entry for entry in os.listdir(inputFolder) if entry[0] != '.']
        )

    def readFirstLine(self, file):
        """Return the next line (including its newline) from the file object."""
        return file.readline()
class WikiProcessor:
    """Strip Wikipedia markup from an article and reject duplicate titles.

    Every pattern used here is a pre-compiled object taken from the
    project's ``regex`` module; what each pattern matches is defined there,
    so comments below only describe the order and the replacement applied.
    """

    # Address of the local (offline) Wikipedia reader; must correspond with
    # the locally running Wikipedia.
    redirectBaseUrl = 'http://127.0.0.1:8000/article/'
    # The wiki article is stored here when it is accessed through the reader.
    redirectResultPath = '/var/tmp/result'

    def __init__(self, redirect):
        """Create a processor.

        redirect -- None (treated as enabled) or a truthy/falsy flag telling
                    whether redirect articles should be resolved.
        """
        self.dictChecker = WikiDict()
        self.redirect = 1 if redirect is None else redirect

    def _applyAll(self, text, steps):
        """Apply each (pattern, replacement) pair of *steps* to *text* in order."""
        for pattern, replacement in steps:
            text = pattern.sub(replacement, text)
        return text

    def _subUntilStable(self, pattern, text):
        """Delete matches of *pattern* repeatedly until none is left."""
        while pattern.search(text):
            text = pattern.sub('', text)
        return text

    def tableCleaner(self, input):
        """Remove tables by converting their delimiters to sentinel characters.

        The start/end markers are replaced by sentinels, converted tables are
        deleted until none remains, and any leftover sentinels are turned
        back into '{|' and '|}'.
        """
        text = regex.regtableConvertStart.sub(r'Ӂ', input)
        text = regex.regtableConvertEnd.sub(r'\1ጣ', text)
        text = self._subUntilStable(regex.regtableConverted, text)
        text = regex.regtableConvertRevertStart.sub('{|', text)
        text = regex.regtableConvertRevertEnd.sub('|}', text)
        return text

    def tableCleanerStephan(self, input):
        """Delete matches of regtableGisleImproved2 until none is left."""
        return self._subUntilStable(regex.regtableGisleImproved2, input)

    def tableCleaner2(self, input):
        """Drop table blocks line by line, scanning top to bottom.

        A line matching regtablestart opens a table and is dropped. While a
        table is open, every line starting with '|' or '!' is dropped
        (this includes the closing '|}' line). The first other line closes
        the table and is kept, so an unterminated table cannot swallow the
        rest of the article.
        """
        keptLines = []
        inTable = False
        for inputLine in input.split("\n"):
            if inTable and inputLine.startswith(("|", "!")):
                continue
            if regex.regtablestart.search(inputLine):
                inTable = True
                continue
            # Any line reaching this point is ordinary text: it ends the
            # table that was open, if there was one.
            inTable = False
            keptLines.append(inputLine)
        return "\n".join(keptLines)

    def tableCleaner2Reverse(self, input):
        """Drop table blocks line by line, scanning bottom to top.

        Mirror image of tableCleaner2: a line starting with '|}' opens a
        table, a line starting with '{|' closes it, and rows starting with
        '|' or '!' in between are dropped. A line of ordinary text inside an
        open table closes it and is kept.
        """
        keptLines = []
        inTable = False
        for inputLine in reversed(input.split("\n")):
            if inTable and inputLine.startswith(("|", "!")):
                continue
            if inputLine.startswith("|}"):
                inTable = True
                continue
            if inTable and inputLine.startswith("{|"):
                inTable = False
                continue
            # Ordinary text ends the table that was open, if there was one.
            inTable = False
            keptLines.append(inputLine)
        keptLines.reverse()
        return "\n".join(keptLines)

    def tableCleaner3(self, input):
        """Remove every line that starts with '!' or '|' (normally tables)."""
        return "\n".join(
            [row for row in input.split("\n") if not row.startswith(("|", "!"))]
        )

    def processFile(self, firstLine, wholeFile):
        """Clean one article.

        firstLine -- first line of the article file (used as its title)
        wholeFile -- the remaining content of the file

        Returns (title, text) for a title seen for the first time, or
        (False, False) when the title has already been processed.
        """
        if self.redirect == 1:
            try:
                firstLine, wholeFile = self.redirectCheck(firstLine, wholeFile)
            except Exception:
                # Best effort: report the problem and carry on with the
                # unresolved article instead of aborting the whole run.
                sys.stdout.write("\nERROR WITH THE REDIRECT PROCESSING\nARE YOU SURE AN OFFLINE WIKIPEDIA READER IS ENABLED?\n\nTo run the script without redirect processing, use the -n option\n")
        if not self.dictChecker.checkIfIn(firstLine):
            return False, False
        firstLine, wholeFile = self.cleanArticle(firstLine, wholeFile)
        return self.addTitle(firstLine), wholeFile

    def cleanArticle(self, firstLine, wholeFile):
        """Return the cleaned title and the cleaned, end-trimmed article text."""
        cleanedTitle = self.regCleanFile(firstLine)
        # Keep the raw title if cleaning leaves nothing behind: processFile()
        # reports a rejected article with a falsy title, so an empty title
        # would be indistinguishable from a duplicate.
        if cleanedTitle.strip():
            firstLine = cleanedTitle
        wholeFile = self.removeEnd(self.regCleanFile(wholeFile))
        return firstLine, wholeFile

    def addTitle(self, firstLine):
        """Return the title without trailing whitespace.

        NOTE(review): the empty prefix/suffix look like placeholders for
        title markers that are no longer present -- confirm before removing.
        """
        return '' + firstLine.rstrip() + ''

    def redirectCheck(self, firstLine, wholeFile):
        """Resolve a redirect article through the local Wikipedia reader.

        If the article contains a redirect link, the redirect target is
        requested from the local reader and the article it stores in
        redirectResultPath is returned as (firstLine, wholeFile). Articles
        without a redirect are returned unchanged.

        Raises whatever the URL request or the file access raises.
        """
        if not regex.regredirect.search(wholeFile):
            return firstLine, wholeFile
        # urlopen moved between Python 2 and 3; import it locally so the
        # module-level imports stay untouched.
        try:
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlopen
        title = regex.regredirect.sub(r'\1', wholeFile)
        url = (self.redirectBaseUrl + title).replace(' ', '_')
        # The response body is not used: the request only makes the reader
        # write the article to redirectResultPath.
        page = urlopen(url)
        page.close()
        with open(self.redirectResultPath, 'r') as resultFile:
            firstLine = resultFile.readline()
            wholeFile = resultFile.read()
        return firstLine, wholeFile

    def addNewline(self, input):
        """Replace every match of regeos with a newline."""
        return regex.regeos.sub('\n', input)

    def removeEnd(self, input):
        """Remove the trailing article sections (sources, see also, notes, ...).

        Section headings matched by the *lookahead patterns are first masked
        with '___' (three passes each), the sections are deleted, the masked
        headings are restored, and the remaining end sections are deleted.
        """
        text = input
        for lookahead in (regex.regsourcelookahead,
                          regex.regseealsolookahead,
                          regex.regnoteslookahead,
                          regex.regreflookahead):
            for _ in range(3):
                text = lookahead.sub(r'___', text)
        return self._applyAll(text, (
            (regex.regsealso, ''),
            (regex.regnotes, ''),
            (regex.regreferences, ''),
            (regex.regsources, ''),
            (regex.regsourcelookaheadrestore, r'\2\1\2'),
            (regex.regseealsolookaheadrestore, r'\2\1\2'),
            (regex.regnoteslookaheadrestore, r'\2\1\2'),
            (regex.regreflookaheadrestore, r'\2\1\2'),
            (regex.regbibliography, ''),
            (regex.regfootnotes, ''),
            (regex.regrelated, ''),
            (regex.regexternal, ''),
        ))

    def regCleanFile(self, input):
        """Strip wiki markup from *input* and return the cleaned text.

        The substitutions are order dependent: templates that must survive
        are wrapped in '<___ ... ___>' first, then templates, boxes, media,
        references and tables are removed, then inline markup is unwrapped,
        and finally lines without letters or digits are dropped.
        """
        # 1. Protect templates that must survive the template removal below.
        preserved = r'<___\1___>'
        text = self._applyAll(input, (
            (regex.regipa, preserved),
            (regex.regjava, preserved),
            (regex.regiast, preserved),
            (regex.reglanggeneralpreserve, preserved),
            (regex.regtransgeneralpreserve, preserved),
            (regex.regnihongopreservere, preserved),
            (regex.regharv_general, preserved),
            (regex.regaudio_general, preserved),
            (regex.regflagtemplate, preserved),
        ))

        # Disabled alternatives, kept for reference.
        # Japanese article templates:
        #   text = regex.regtransjap.sub(r'\1', text)
        #   text = regex.reglangjap.sub(r'\1', text)
        # To expand the Harvard templates instead of protecting them:
        #   text = regex.regharv_aut_aut_year_page.sub(r'(\1 & \2 \3, \4)', text)
        #   text = regex.regharv_aut_year_page.sub(r'(\1 2, \3)', text)
        #   text = regex.regharvtxt_aut_year_page.sub(r'\1 (\2, \3)', text)
        #   text = regex.regharvtxt_aut_year.sub(r'\1 (\2)', text)
        #   text = regex.regharvtxt_aut_aut_year_page.sub(r'\1 & \2 (\3, \4)', text)
        #   text = regex.regharvtxt_aut_aut_year.sub(r'\1 & \2 (\3)', text)
        #   text = regex.regharvnb_aut_year_page_nb.sub(r'\1 \2, \3', text)
        #   text = regex.regharvnb_aut_year_nb.sub(r'\1 \2', text)
        #   text = regex.regharvnb_aut_aut_year_page_nb.sub(r'\1 & \2 \3, \4', text)
        #   text = regex.regharvnb_aut_aut_aut_year_page_nb.sub(r'\1, \2 & \3 \4, \5', text)
        #   text = regex.regharvnb_aut_aut_year_nb.sub(r'\1 & \2 \3', text)
        #   text = regex.regharvnb_aut_aut_aut_year_nb.sub(r'\1, \2 & \3 \4', text)
        #   text = regex.regharvcoltxt_aut_year_page.sub(r'\1 (\2:\3)', text)
        # To expand the language/nihongo templates:
        #   text = regex.regtransgeneral.sub(r'\1', text)
        #   text = regex.reglanggeneral.sub(r'\1', text)
        #   text = regex.regnihongohardcode.sub(r'\1)', text)
        #   text = regex.regnihongojap5.sub(r'\1 (\2 \3 \4 \5)', text)
        #   text = regex.regnihongojap4.sub(r'\1 (\2 \3 \4)', text)
        #   text = regex.regnihongojap3.sub(r'\1 (\2 \3)', text)
        #   text = regex.regnihongojap2.sub(r'\1 (\2)', text)
        # Repeated removal with regex.regcurly1 (loop or five fixed passes).

        # 2. Remove templates and box tables until no match is left.
        text = self._subUntilStable(regex.reglongTemp, text)
        text = self._subUntilStable(regex.regboxtable, text)

        # 3. Block-level and media markup.
        # NOTE(review): the blockquote replacement is a space, group 1 and a
        # newline; the original literal was broken across two source lines,
        # so confirm nothing else belonged in it.
        text = self._applyAll(text, (
            (regex.regblockquote, ' \\1\n'),
            (regex.regdiv2, ''),
            (regex.reggallery, ''),
            # regimage is applied twice on purpose (kept from the original).
            (regex.regimage, r''),
            (regex.regimage, r''),
            (regex.regref2, ''),
            (regex.regref, ''),
            (regex.regcomment, ''),
            (regex.regsingleast, ''),
            (regex.regdeflist, r'\1'),
        ))

        # 4. Tables: regex based pass first, then the two line based passes.
        # Disabled: self.tableCleaner(text) before, self.tableCleaner3(text)
        # after, and five passes of regex.regwikitable2.
        text = self.tableCleanerStephan(text)
        text = self.tableCleaner2(text)
        text = self.tableCleaner2Reverse(text)
        text = self._applyAll(text, (
            (regex.regwikitable3, ''),
            (regex.regtable, ''),
            (regex.regtableborder, ''),
            (regex.regtablehardcode, ''),
            (regex.regcategory, ''),
        ))

        # 5. Inline markup: restore curly braces, unwrap links, bullets,
        # indentation and headings.
        text = self._applyAll(text, (
            (regex.regbacktocurly1, '{{'),
            (regex.regbacktocurly2, '}}'),
            (regex.regsentinitialbracket, r'\1\2'),
            (regex.regbracket, r'\1'),
            (regex.regbullets, r'\1'),
            (regex.regbullets2, r'\1'),
            (regex.regindentcolon, r'\1'),
            (regex.regbulletscolon, r'\2'),
            (regex.regbr, r''),
            (regex.regtitle, r'\1'),
            (regex.regparagraph, ''),
            (regex.regyeareos, r'\1'),
            (regex.regorg, r'\1'),
            (regex.reghyphen, ''),
        ))
        # Disabled: text = regex.regremovenowiki.sub('', text)

        # 6. Drop lines without content, then normalise newlines.
        text = self.removeLines(text)
        return regex.regremovenewline.sub(' ', text)

    def removeTableLeftover(self, input):
        """Return *input* without leftover table rows and markup-only lines.

        A line is dropped when it has no match for regletternumber, when it
        matches regonlyXML (unless it contains "ARTICLE>"), or when it starts
        with '|', '!', '{|' or '|}'. Trailing whitespace of the result is
        stripped.
        """
        keptLines = []
        for line in input.split('\n'):
            if not regex.regletternumber.search(line):
                continue
            if "ARTICLE>" not in line and regex.regonlyXML.match(line):
                continue
            if line.startswith(("|", "!", "{|", "|}")):
                continue
            keptLines.append(line)
        return "\n".join(keptLines).rstrip()

    def removeLines(self, line):
        """Keep only the lines of *line* that match regletternumber."""
        return "\n".join(
            [row for row in line.split("\n") if regex.regletternumber.search(row)]
        )
class WikiDict:
    """Remember which article titles have already been seen."""

    def __init__(self, wikiDict=None):
        """Create the title store.

        wikiDict -- optional mapping whose keys are titles to treat as
                    already seen; it is copied, never modified.
        """
        self.wikiDict = {} if wikiDict is None else dict(wikiDict)

    def checkIfIn(self, firstLine):
        """Register *firstLine* and tell whether it was new.

        Returns True the first time a title is offered (and stores it),
        False for every later occurrence of the same title.
        """
        if firstLine in self.wikiDict:
            return False
        self.wikiDict[firstLine] = ""
        return True
class WikiWriter:
    """Write cleaned articles, one after another, to a single output file."""

    def __init__(self, output):
        # The output file is opened for writing as soon as the writer exists.
        self.outFile = open(output, 'w')

    def writeFile(self, firstLine, wholeFile):
        """Write the title, the article text and a blank-line separator.

        Nothing is written when *wholeFile* is empty or otherwise falsy.
        """
        if not wholeFile:
            return
        self.outFile.writelines(('' + firstLine + '', wholeFile, '\n\n'))

    def closeFile(self):
        """Close the output file."""
        self.outFile.close()
def checkSyntax(input, output):
    """Validate the command line paths.

    Returns True only when both values are given, *input* is an existing
    directory and *output* is not a directory; False otherwise.
    """
    if input is None or output is None:
        return False
    return os.path.isdir(input) and not os.path.isdir(output)
def main():
    """Command line entry point: clean every article file of a folder.

    Reads the -i/-o/-n options, validates the paths, then processes the
    regular files of the input folder in sorted order and writes every
    accepted article to the output file. Exits with status 1 when the
    input/output arguments are invalid.
    """
    parser = OptionParser()
    parser.add_option("-i", "--input", dest="input",
                      help="Input folder where Wikipedia Source files are stored", metavar="Input Folder")
    parser.add_option("-o", "--output", dest="output",
                      help="Output file where cleansed Wikipedia Source files will be stored", metavar="Output File")
    parser.add_option("-n", "--noredirects", dest="redirects", action="store_false",
                      help="No connection to local Wikipedia Reader, therefore no redirect processing.",
                      metavar="redirects")
    options, args = parser.parse_args()
    inputFolder = options.input
    outputFile = options.output

    if not checkSyntax(inputFolder, outputFile):
        sys.stdout.write("\nERROR WITH THE INPUT/OUTPUT FILES\n")
        # A usage error must not look like a successful run to the caller.
        sys.exit(1)

    wikiReader = WikiReader()
    # options.redirects is None unless -n was given (then it is False).
    wikiProcess = WikiProcessor(options.redirects)
    wikiWriter = WikiWriter(outputFile)
    inputFolder = AdjustName().removeSlash(inputFolder)
    try:
        for entry in wikiReader.listFiles(inputFolder):
            fileName = inputFolder + '/' + entry
            if not os.path.isfile(fileName):
                continue
            # The first line is read separately from the rest of the file.
            with open(fileName, 'r') as fileObject:
                firstLine = wikiReader.readFirstLine(fileObject)
                wholeFile = wikiReader.readFile(fileObject)
            firstLine, wholeFile = wikiProcess.processFile(firstLine, wholeFile)
            if firstLine:  # False when the article was already processed
                wikiWriter.writeFile(firstLine, wholeFile)
    finally:
        # Close the output even if an article fails, so written data is kept.
        wikiWriter.closeFile()


if __name__ == '__main__':
    main()