package lkformat2;
import java.io.*;
import java.util.*;
//import java.util.regex.*;
public class SSTToLK {
/** Character encoding passed to every stream reader and writer this class opens. */
private static final String ENCODING = "UTF-8";
/**
 * Command-line entry point.
 *
 * Expects four arguments: {@code lkDir sstOutputFile conll2008File outDir}.
 * Every entry of {@code lkDir} whose name ends in {@code lktext.xml} is handed,
 * in sorted name order, to {@link #processFile} together with one shared reader
 * over {@code sstOutputFile} and one shared writer to {@code conll2008File}.
 *
 * The process exits with status 1 when the arguments are missing, the directory
 * cannot be listed, a stream cannot be opened, or the tabular output reports a
 * write error.
 *
 * @param argv the four command-line arguments described above
 */
public static void main(String[] argv) {
    if (argv.length < 4) {
        System.err.println("Usage: SSTToLK <lkDir> <sstOutputFile> <conll2008File> <outDir>");
        System.exit(1);
    }
    String lkDir = argv[0];
    String sstOutputFile = argv[1];
    String conll2008File = argv[2];
    String outDir = argv[3];
    System.out.println("argv = " + Arrays.toString(argv));

    BufferedReader sstInput = null;
    PrintWriter conll2008Out = null;
    boolean failed = false;
    try {
        // File.list() returns null (not an empty array) when the path is not a
        // directory or cannot be read; check before sorting and before the
        // output file is created/truncated.
        String[] files = new File(lkDir).list();
        if (files == null)
            throw new FileNotFoundException("Cannot list directory: " + lkDir);
        Arrays.sort(files);

        sstInput = new BufferedReader(new InputStreamReader(new FileInputStream(sstOutputFile), ENCODING));
        conll2008Out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(conll2008File), ENCODING));
        for (String file : files) {
            if (file.endsWith("lktext.xml"))
                processFile(lkDir + File.separatorChar + file, sstInput,
                        conll2008Out, outDir);
        }
        // PrintWriter never throws on write failure; checkError() flushes and
        // reports whether any write went wrong.
        if (conll2008Out.checkError())
            throw new IOException("Error while writing " + conll2008File);
    } catch (Exception e) {
        e.printStackTrace();
        failed = true;
    } finally {
        if (conll2008Out != null)
            conll2008Out.close();
        if (sstInput != null) {
            try {
                sstInput.close();
            } catch (IOException ignored) {
                // Input was only read from; a failed close loses no data.
            }
        }
    }
    // Exit only after the finally block so the streams are flushed and closed.
    if (failed)
        System.exit(1);
}
/**
 * Consumes one document's section of the SST output and writes three outputs for it.
 *
 * The section read from {@code sstInput} must begin with a line starting with
 * {@code ___BEGIN___}; reading continues until a line containing {@code ___END___}
 * (or end of input). Each non-empty line in between is split on single spaces into
 * groups of six fields: token, POS tag, lemma and three IOB tags.
 *
 * Outputs:
 * - rows appended to {@code tabularCoNLL08Out}: a BEGIN row, one ten-column row per
 *   token with an empty line after each input line, and an END row;
 * - {@code <basename>.pos.xml} in {@code outDir}, receiving one entity per token POS
 *   followed by one entity per lemma;
 * - {@code <basename>.sst.xml} in {@code outDir}, receiving the three IOB layers via
 *   {@code printIOB}.
 *
 * Any exception is printed and the JVM exits with status 1.
 *
 * NOTE(review): this copy does not compile as shown. {@code endString} is used but
 * never declared, the lists are raw {@code ArrayList}s yet read as {@code String},
 * and most string literals written to the two .xml files are empty. It looks like
 * markup and type arguments were lost from this copy; confirm against the upstream
 * source before relying on the output format.
 *
 * @param textFile path of the lktext.xml file; it is never opened here, only its
 *        name (directory part removed) is printed and used to derive output names
 * @param sstInput shared reader positioned at the start of this document's section
 * @param tabularCoNLL08Out shared writer for the tabular output; not closed here
 * @param outDir directory in which the .pos.xml and .sst.xml files are created
 */
public static void processFile(String textFile, BufferedReader sstInput,
PrintWriter tabularCoNLL08Out, String outDir) {
try {
// Remove every "dir/" prefix component; only '/' is treated as a separator.
textFile = textFile.replaceAll("[^/]+/", "");
System.out.println("textFile = " + textFile);
String line = sstInput.readLine();
if(line == null || !line.startsWith("___BEGIN___"))
throw new RuntimeException("Excpected beginning of file...");
// The header carries a file name after "___BEGIN___|"; keep it up to the first space.
String tokenFile = line.substring("___BEGIN___|".length());
tokenFile = tokenFile.replaceAll(" .*", "");
// BEGIN marker row: ten tab-separated columns, same width as the token rows below.
tabularCoNLL08Out.print("0\t___BEGIN___|" + tokenFile + "\t");
tabularCoNLL08Out.print("_\t");
tabularCoNLL08Out.print("_\t");
tabularCoNLL08Out.print("_\t");
tabularCoNLL08Out.print("_\t");
tabularCoNLL08Out.print("___BEGIN___|" + tokenFile + "\t");
tabularCoNLL08Out.print("_\t");
tabularCoNLL08Out.print("0\t");
tabularCoNLL08Out.println("ROOT");
tabularCoNLL08Out.println();
tokenFile = tokenFile.replaceAll("[^/]+/", "");
System.out.println(tokenFile);
// Output base name: text file name without ".lktext.xml", placed under outDir.
String basename = textFile.replaceFirst("\\.lktext\\.xml", "");
basename = basename.replaceAll("[^/]+/", "");
basename = outDir + File.separatorChar + basename;
String outPOSFile = basename + ".pos.xml";
String outSSTFile = basename + ".sst.xml";
PrintWriter posOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outPOSFile), ENCODING));
PrintWriter sstOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outSSTFile), ENCODING));
// NOTE(review): the literals in the two header runs below are empty or nearly empty
// even though the targets are .xml files; markup was probably lost - confirm.
posOut.println("");
posOut.println("");
posOut.println("");
posOut.println(" " + textFile + "");
posOut.println(" SSTLight");
posOut.println("");
posOut.println("");
sstOut.println("");
sstOut.println("");
sstOut.println("");
sstOut.println(" " + textFile + "");
sstOut.println(" SSTLight");
sstOut.println("");
// Per-token accumulators, filled in token order across all lines of the section.
// NOTE(review): raw types; the reads below treat the elements as String.
ArrayList lemmas = new ArrayList();
ArrayList ssIOB = new ArrayList();
ArrayList conll03IOB = new ArrayList();
ArrayList wsjIOB = new ArrayList();
// Running 1-based token id over the whole section (not reset per line).
int tokenIdCounter = 0;
line = sstInput.readLine();
while(line != null && !line.contains("___END___")) {
line = line.trim();
if(!line.equals("")) {
String[] ss = line.split(" ");
// Six fields per token: token, pos, lemma, then three IOB tags.
if(ss.length % 6 != 0)
throw new RuntimeException("this line: |" + line + "|");
int n = ss.length / 6;
for(int i = 0; i < n; i++) {
String token = ss[6*i];
String pos = ss[6*i + 1];
String lemma = ss[6*i + 2];
lemmas.add(lemma);
// Token row; the first column restarts at 1 on every input line.
tabularCoNLL08Out.print((i + 1) + "\t");
tabularCoNLL08Out.print(token + "\t");
tabularCoNLL08Out.print(lemma + "\t");
tabularCoNLL08Out.print("_\t");
tabularCoNLL08Out.print(pos + "\t");
tabularCoNLL08Out.print(token + "\t");
tabularCoNLL08Out.print(lemma + "\t");
tabularCoNLL08Out.print(pos + "\t");
tabularCoNLL08Out.print("0\t");
tabularCoNLL08Out.println("ROOT");
// On the first token of a line, turn a leading "I-" tag into "B-" so that an
// entity from the previous line is not continued across the line boundary.
if(i == 0)
for(int j = 3; j < 6; j++)
if(ss[6*i + j].startsWith("I-"))
ss[6*i + j] = "B-" + ss[6*i + j].substring(2);
ssIOB.add(ss[6*i + 3]);
conll03IOB.add(ss[6*i + 4]);
wsjIOB.add(ss[6*i + 5]);
// POS entity: its id equals the token id it refers to; -1 is passed as the end.
int id = ++tokenIdCounter;
printEntity(pos, id, -1, id, posOut);
}
// Empty line after each input line's rows.
tabularCoNLL08Out.println();
}
line = sstInput.readLine();
}
int nTokens = tokenIdCounter;
posOut.println("");
posOut.println("");
// Lemma entities: ids continue after the POS entity ids (token id + nTokens).
for(int i = 0; i < lemmas.size(); i++) {
String lemma = lemmas.get(i);
int tid = i + 1;
int id = tid + nTokens;
printEntity(lemma, tid, -1, id, posOut);
}
posOut.println("");
posOut.println("");
posOut.close();
// The three IOB layers share one running entity id (ssid) across calls.
// NOTE(review): endString is not declared anywhere in this class as shown, and
// preamble is the empty string; both look like casualties of lost markup - confirm.
String preamble = "";
int ssid = 0;
ssid = printIOB(sstOut, preamble + "WNSS" + endString, ssid, nTokens, ssIOB);
ssid = printIOB(sstOut, preamble + "NE-CONLL03" + endString, ssid, nTokens, conll03IOB);
ssid = printIOB(sstOut, preamble + "NE-WSJ" + endString, ssid, nTokens, wsjIOB);
sstOut.println("");
sstOut.close();
// END marker row, mirroring the BEGIN row.
tabularCoNLL08Out.print("0\t___END___\t");
tabularCoNLL08Out.print("_\t");
tabularCoNLL08Out.print("_\t");
tabularCoNLL08Out.print("_\t");
tabularCoNLL08Out.print("_\t");
tabularCoNLL08Out.print("___END___\t");
tabularCoNLL08Out.print("_\t");
tabularCoNLL08Out.print("0\t");
tabularCoNLL08Out.println("ROOT");
tabularCoNLL08Out.println();
// Both writers were already closed above; these second calls are redundant.
posOut.close();
sstOut.close();
} catch(Exception e) {
e.printStackTrace();
System.exit(1);
}
}
/**
 * Builds one line of text around the label {@code l} and prints it to {@code out}.
 *
 * Callers in this class pass {@code end == -1} for an entity that refers to a single
 * token and a token index for an entity that covers a range.
 *
 * NOTE(review): this method does not compile as shown. The {@code else} below has no
 * matching {@code if}, and neither {@code end} nor {@code id} is read anywhere in the
 * body. The initial builder text and the closing literal are also blank/empty. It
 * looks like the element opening and the branch for {@code end != -1} were lost from
 * this copy (possibly stripped as markup) - restore from the upstream source; do not
 * guess the attribute names.
 *
 * @param l label text appended to the line
 * @param start token id referenced by the surviving branch (written after {@code on="#})
 * @param end end token id, or -1 at single-token call sites; unused in this copy
 * @param id entity id; unused in this copy
 * @param out writer that receives the finished line
 */
static void printEntity(String l, int start, int end,
int id, PrintWriter out) {
StringBuilder sb = new StringBuilder(" ");
// NOTE(review): dangling else - the if branch that preceded it is missing.
else
sb.append("\" on=\"#" + start + "\">");
sb.append(l);
// NOTE(review): appending an empty string has no effect; presumably a closing tag.
sb.append("");
out.println(sb);
}
/**
 * Converts one layer of per-token IOB tags into entities and prints them.
 *
 * {@code preamble} is printed first. The tags are then scanned in order; token ids
 * are 1-based ({@code index + 1}). A tag is either {@code "0"} (outside any entity),
 * {@code "B-TYPE"} or {@code "I-TYPE"}. An open entity is closed, and printed through
 * {@code printEntity}, when the next tag is {@code "0"}, starts with {@code "B-"}, or
 * names a different type. Any non-"0" tag seen while no entity is open starts a new
 * one, so a stray {@code "I-"} tag is accepted as an entity start. An entity still
 * open after the last tag is closed at {@code nTokens}.
 *
 * @param sstOut writer receiving the preamble, the entities and a final line
 * @param preamble line printed before the entities of this layer
 * @param ssid last entity id already used; each printed entity uses the next value
 * @param nTokens end token id used for an entity that runs to the end of the list
 * @param iob IOB tags, one per token, in token order
 * @return the last entity id used, to be passed to the next call
 * @throws RuntimeException if a tag is not "0" and does not start with "B-" or "I-"
 */
private static int printIOB(PrintWriter sstOut, String preamble,
        int ssid, int nTokens, List<String> iob) {
    sstOut.println(preamble);
    String openTag = null;
    int openTagStart = -1;
    for (int i = 0; i < iob.size(); i++) {
        String tag = iob.get(i);
        boolean outside = tag.equals("0");
        if (!outside && !tag.startsWith("B-") && !tag.startsWith("I-"))
            throw new RuntimeException("Illegal tag " + tag);
        int tid = i + 1;
        String type = outside ? null : tag.substring(2);

        // Close the running entity just before this token when the tag leaves it,
        // begins a new one, or switches type.
        if (openTag != null
                && (outside || tag.startsWith("B-") || !type.equals(openTag))) {
            printEntity(openTag, openTagStart, tid - 1, ++ssid, sstOut);
            openTag = null;
        }
        // After the close check above, an entity that is still open necessarily has
        // the same type as this tag, so only the "nothing open" case needs handling.
        if (!outside && openTag == null) {
            openTag = type;
            openTagStart = tid;
        }
    }
    if (openTag != null) {
        printEntity(openTag, openTagStart, nTokens, ++ssid, sstOut);
    }
    // NOTE(review): this prints an empty line; if it was meant to emit a closing
    // element matching the preamble, the literal needs restoring - confirm upstream.
    sstOut.println("");
    return ssid;
}
}