package mpqareader;
import java.io.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class PatchCorpus {
/**
 * Command-line entry point. The first argument selects the operating mode and the
 * complete argument array is handed to that mode unchanged.
 *
 * @param argv {@code argv[0]} must be {@code -strip}, {@code -patch} or {@code -check};
 *             anything else (or no arguments at all) prints an error and exits with status 1
 */
public static void main(String[] argv) {
    if(argv.length == 0) {
        System.err.println("Must specify -strip, -patch, or -check.");
        System.exit(1);
    }

    final String mode = argv[0];

    if(mode.equals("-strip")) {
        strip(argv);
        return;
    }
    if(mode.equals("-patch")) {
        patch(argv);
        return;
    }
    if(mode.equals("-check")) {
        check(argv);
        return;
    }

    // None of the known flags matched.
    System.err.println("Unknown mode: " + mode);
    System.exit(1);
}
/**
 * Implements the {@code -patch} mode.
 *
 * Reads three paths from the command line: the directory of stripped files
 * ({@code argv[1]}), the MPQA document directory ({@code argv[2]}) and the output
 * directory ({@code argv[3]}). The output directory is created with a single
 * {@code mkdir()} when it does not exist. Any exception is reported with a stack
 * trace and ends the JVM with status 1.
 *
 * NOTE(review): the body is not well-formed in this copy of the file. The statement
 * that starts with {@code int fn_eix} runs straight into a block that uses
 * {@code line}, {@code text}, {@code pw} and a second use of {@code br} with no
 * visible declarations, so part of the method appears to be missing - TODO restore
 * from the authoritative source before relying on any of the comments below.
 *
 * @param argv the full command line, including the mode flag at index 0
 */
private static void patch(String[] argv) {
if(argv.length < 4) {
System.err.println("Must give 3 arguments: directory of stripped "
+ "files, MPQA document directory, and output directory.");
System.exit(1);
}
try {
File strippedDir = new File(argv[1]);
String mpqaDir = argv[2];
String outDir = argv[3];
// Create the output directory on demand; only one level is created (mkdir, not mkdirs).
if(!new File(outDir).exists()) {
System.err.println("Output directory does not exist.");
if(!new File(outDir).mkdir()) {
System.err.println("Could not create output directory.");
System.exit(1);
} else
System.err.println("Created output directory.");
}
// Printed in the summary line at the end; no increment is visible in this copy of the method.
int count = 0;
// One read buffer shared by every file read below.
char[] buf = new char[BUF_SIZE];
for(File f: strippedDir.listFiles()) {
if(f.getName().endsWith("lktext.xml")) {
// Read the whole stripped file into memory as UTF-8.
StringBuilder sb = new StringBuilder();
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
while(true) {
int n = br.read(buf);
if(n == -1)
break;
sb.append(new String(buf, 0, n));
}
br.close();
// Position of the placeholder that marks where text was removed.
int six = sb.indexOf("___STRIPPED___");
// Index just past the "database.mpqa.2.0/docs/" prefix found in the file.
int fn_six = sb.indexOf("database.mpqa.2.0/docs/") + "database.mpqa.2.0/docs/".length();
// NOTE(review): damaged line - the parentheses do not balance and the code that
// follows belongs to a per-line token loop whose setup is not visible here.
int fn_eix = sb.indexOf("")) {
Matcher m = TOKEN_PAT.matcher(line);
if(!m.find())
throw new RuntimeException("line = " + line);
String id = m.group(1);
int start = Integer.parseInt(m.group(2));
int end = Integer.parseInt(m.group(3));
String encToken = m.group(4);
String token;
// A token equal to the placeholder is taken from 'text'; 'end' is used as an
// inclusive offset, hence the + 1. Any other token is kept as it is.
if(encToken.equals("___STRIPPED___"))
token = text.substring(start, end + 1);
else
token = encToken;
pw.println(" " + token + "");
line = br.readLine();
}
// Copy whatever lines remain.
while(line != null) {
pw.println(line);
line = br.readLine();
}
br.close();
pw.close();
} else if(!f.getName().endsWith("lktext.xml")) {
// As the code stands, every file whose name does not end in "lktext.xml" is read
// as UTF-8 and written to the output directory under the same name.
StringBuilder sb = new StringBuilder();
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
while(true) {
int n = br.read(buf);
if(n == -1)
break;
sb.append(new String(buf, 0, n));
}
br.close();
String outFileName = outDir + File.separatorChar + f.getName();
PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8"));
pw.print(sb);
pw.close();
}
}
System.out.println("Patched " + count + " files.");
} catch(Exception e) {
// Any failure is fatal: report it and stop with a non-zero status.
e.printStackTrace();
System.exit(1);
}
}
private static String encodeXML(String s) {
s = s.replaceAll("&", "&");
s = s.replaceAll("\"", """);
s = s.replaceAll("<", "<");
s = s.replaceAll(">", ">");
String c26 = "" + (char) 26;
s = s.replaceAll(c26, "");
return s;
}
private static String decodeXML(String s) {
s = s.replaceAll(""", "\"");
s = s.replaceAll("<", "<");
s = s.replaceAll(">", ">");
s = s.replaceAll("&", "&");
return s;
}
/** Size, in characters, of the scratch buffer used for chunked file reads. */
private static final int BUF_SIZE = 10000;
// Pattern with four capture groups: the id, start and end attribute values (start and
// end are written with a leading '#') and a fourth group after the following '>'.
//
// NOTE(review): this declaration is damaged in this copy of the file. The initializer
// adds an int to a Pattern, which does not compile, and the statements after it sit
// outside any method while using locals (sb, six, outDir, f, count, text, buf) that
// are not declared here. Judging by the "Stripped ... files." message they look like
// the tail of the -strip mode that main() dispatches to, with everything between the
// end of the pattern and this point lost - TODO restore from the authoritative source.
private static final Pattern TOKEN_PAT = Pattern.compile("id=\"(.*?)\" start=\"#(.*?)\" end=\"#(.*?)\".*?>(.*?)") + "".length();
int eix = sb.lastIndexOf("");
// Replace the span [six, eix) of the buffer with the placeholder and write the result out.
sb.delete(six, eix);
sb.insert(six, "___STRIPPED___");
String outFileName = outDir + File.separatorChar + f.getName();
PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8"));
pw.print(sb);
pw.close();
count++;
} else if(f.getName().endsWith("tokens.xml")) {
// Token files are rewritten line by line.
// NOTE(review): 'br' is never closed in this branch.
String outFileName = outDir + File.separatorChar + f.getName();
PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8"));
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
String line = br.readLine();
while(line != null) {
pw.println(line);
// NOTE(review): the literal tested here is empty and the loop structure around it
// looks incomplete - part of this branch appears to be missing as well.
if(line.contains("")) {
Matcher m = TOKEN_PAT.matcher(line);
if(!m.find())
throw new RuntimeException("line = " + line);
String id = m.group(1);
int start = Integer.parseInt(m.group(2));
int end = Integer.parseInt(m.group(3));
String token = m.group(4);
// 'end' is used as an inclusive offset into 'text', hence the + 1.
String ttext = text.substring(start, end + 1);
String encToken;
// A token identical to the corresponding slice of 'text' is replaced by the
// placeholder; a token that differs is kept verbatim.
if(token.equals(ttext))
encToken = "___STRIPPED___";
else
encToken = token;
pw.println(" " + encToken + "");
line = br.readLine();
}
// Copy whatever lines remain.
while(line != null) {
pw.println(line);
line = br.readLine();
}
pw.close();
} else {
// Every other file is read as UTF-8 and written to the output directory under the same name.
// NOTE(review): 'br' is never closed in this branch either.
StringBuilder sb = new StringBuilder();
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
while(true) {
int n = br.read(buf);
if(n == -1)
break;
sb.append(new String(buf, 0, n));
}
String outFileName = outDir + File.separatorChar + f.getName();
PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8"));
pw.print(sb);
pw.close();
}
}
System.out.println("Stripped " + count + " files.");
} catch(Exception e) {
// Any failure is fatal: report it and stop with a non-zero status.
e.printStackTrace();
System.exit(1);
}
}
/**
 * Reads {@code file} as UTF-8 and returns the XML-decoded text that lies between the
 * first occurrence of the start marker and the last occurrence of the end marker.
 *
 * @param file path of the file to read
 * @param buf  scratch buffer for the chunked read; its contents are overwritten
 * @return the decoded text between the two markers
 * @throws IOException if the file cannot be opened or read, or if the markers are not
 *                     found in the expected order
 */
private static String readText(String file, char[] buf) throws IOException {
    // NOTE(review): both marker literals are empty strings in this copy of the file, so
    // the selected span is the whole content. They look like element tags that were
    // lost - TODO restore them from the authoritative source.
    final String startMarker = "";
    final String endMarker = "";

    StringBuilder sb = new StringBuilder();
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
    try {
        int n;
        while((n = br.read(buf)) != -1) {
            // Append straight from the buffer; no temporary String per chunk.
            sb.append(buf, 0, n);
        }
    } finally {
        // The reader used to be left open, leaking one file handle per call.
        br.close();
    }

    int markerIx = sb.indexOf(startMarker);
    int six = markerIx + startMarker.length();
    int eix = sb.lastIndexOf(endMarker);
    // Fail clearly when a marker is missing, so a -1 from indexOf cannot turn into
    // a silently wrong slice.
    if(markerIx < 0 || eix < six) {
        throw new IOException("Text markers not found in " + file);
    }
    return decodeXML(sb.substring(six, eix));
}
/**
 * Implements the {@code -check} mode: compares every file in one directory with the
 * file of the same name in a second directory.
 *
 * Files are read as UTF-8 and compared after CRLF line endings are reduced to LF.
 * For the first pair that differs, the two lengths and a character-by-character
 * listing of the common prefix are printed (differing positions are flagged with
 * {@code X}), and checking stops. A dot is printed for every 50 files as a progress
 * indicator. Any exception is reported with a stack trace and ends the JVM with
 * status 1.
 *
 * @param argv the full command line: {@code argv[1]} is the directory whose files are
 *             enumerated, {@code argv[2]} the directory holding the files to compare against
 */
private static void check(String[] argv) {
    if(argv.length < 3) {
        System.err.println("Must give 2 arguments: directory to check "
                + "and directory to compare against.");
        System.exit(1);
    }
    try {
        File dir = new File(argv[1]);
        String outDir = argv[2];
        // listFiles() returns null when the path is not a readable directory.
        File[] files = dir.listFiles();
        if(files == null) {
            System.err.println("Not a readable directory: " + dir);
            System.exit(1);
        }
        char[] buf = new char[BUF_SIZE];
        int count = 0;
        for(File f: files) {
            count++;
            if(count % 50 == 0) {
                System.out.print(".");
                System.out.flush();
            }
            String fileName2 = outDir + File.separatorChar + f.getName();
            // The old code replaced "\n\r" with "\r", a sequence that practically never
            // occurs. Its CR/LF labels were swapped too, so the intent was evidently to
            // treat CRLF and LF files as equal.
            String s1 = slurp(f, buf).replace("\r\n", "\n");
            // The second string used to be built from the first file's buffer, so the
            // comparison could never fail.
            String s2 = slurp(new File(fileName2), buf).replace("\r\n", "\n");
            if(!s1.equals(s2)) {
                System.out.println(f.getName() + " and " + fileName2 + " differ.");
                System.out.println(s1.length());
                System.out.println(s2.length());
                // Only walk the part both strings have; the lengths above show any tail.
                int common = Math.min(s1.length(), s2.length());
                for(int i = 0; i < common; i++) {
                    char c1 = s1.charAt(i);
                    char c2 = s2.charAt(i);
                    if(c1 != c2) {
                        System.out.print("X");
                    }
                    System.out.println(i + "\t" + describeChar(c1) + "(" + (int) c1 + ")\t"
                            + describeChar(c2) + "(" + (int) c2 + ")");
                }
                // Stop at the first differing pair.
                break;
            }
        }
        System.out.println();
        System.out.println("Checked " + count + " files.");
    } catch(Exception e) {
        e.printStackTrace();
        System.exit(1);
    }
}

/** Reads the whole of {@code f} as UTF-8 using {@code buf} as scratch space, and closes the file. */
private static String slurp(File f, char[] buf) throws IOException {
    StringBuilder sb = new StringBuilder();
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
    try {
        int n;
        while((n = br.read(buf)) != -1) {
            sb.append(buf, 0, n);
        }
    } finally {
        br.close();
    }
    return sb.toString();
}

/** Renders a character for the diff listing: CR and LF as escapes, other control characters as their code. */
private static String describeChar(char c) {
    if(c == 13) {
        return "\\r";
    }
    if(c == 10) {
        return "\\n";
    }
    if(c < 32) {
        return "(" + (int) c + ")";
    }
    return "" + c;
}
}