// CoNLL2008ToLK: command-line converter that reads a CoNLL-2008 file and writes two
// output files: a dependency-syntax file and a predicate-argument file.
//
// Arguments (see main):
//   argv[0] = lkTextFile    - only concatenated into one printed line of each output
//   argv[1] = lkTokenFile   - read line by line; also the basis for the output file names
//   argv[2] = conll2008File - read twice through CoNLL2008Format.readNextGraph
// Output names: every match of the regex "\.tokens\.xml" is removed from lkTokenFile, then
// ".depsyntax.xml" and ".predargs.xml" are appended.
// Both outputs are written through an OutputStreamWriter using ENCODING; both inputs are
// opened with FileReader and no explicit charset.
//
// Flow of main, as far as it is visible in this text:
//   1. The token file is read with readLine() and each line is trimmed. The remainder of
//      that loop is not visible here (see the review note below).
//   2. A few header lines are printed to both outputs.
//   3. First pass over the CoNLL file: for every triple returned by readNextGraph (until it
//      returns null) printDepGraph is called with t.first and printPreds with t.third;
//      depIndex then advances by dg.nodes.length - 1 and an empty line is written to
//      depOut. Afterwards depOut is closed.
//   4. Second pass: a new Scanner is opened on the same CoNLL file, depIndex is reset to 0
//      and printArgs is called per graph; afterwards paOut is closed.
//   5. Any Exception is reported with printStackTrace() and the JVM exits with status 1.
//
// Helpers:
//   printDepGraph - loops over dg.nodes[1..length-1]; throws IllegalArgumentException when a
//                   node does not have exactly one parent; reads the child's token id from
//                   ids at index depIndexStart + n.position - 1; the parent's token id is
//                   only looked up when p.position != 0 (position 0 is presumably the
//                   artificial root - confirm). Each node prints one line containing
//                   n.relations[0].
//   printPreds    - for each PAStructure: pre-increments paIdCounter to get an id, stores
//                   (absolute predicate position -> id) in predPosToId, prints one line
//                   containing pa.lemma; finally prints an empty line and returns the
//                   updated counter.
//   printArgs     - for each PAStructure: reads the predicate id back from predPosToId; for
//                   each argument pre-increments paIdCounter and prints one line containing
//                   the argument label; finally prints an empty line and returns the
//                   updated counter. The counter continues from the value left by
//                   printPreds, so predicate and argument ids do not overlap.
//
// NOTE(review): this text looks damaged and should be restored from version control before
// it is compiled or edited:
//   - most string literals passed to println are empty or lack the text that would use the
//     computed values (childTokenId, parentTokenId, id, tokenId, predId are computed but do
//     not appear in any printed string) - presumably markup was stripped; confirm.
//   - type arguments are missing ("Triple> t", raw "List pas").
//   - the code following line.startsWith(" is cut off, so the part that fills ids (and
//     uses pattern and started) is not visible; presumably ids holds the id attributes of
//     the token file in document order - confirm.
//   - the file is stored on very few physical lines, so the "// skip" end-of-line comment
//     extends to the end of the first physical line.
// NOTE(review): no close() call is visible for tInput or for either Scanner, and the
// tokenFileName parameter of printArgs is not used in the visible body.
package lkformat2; import java.io.*; import java.util.*; import java.util.regex.*; import se.lth.cs.nlp.depsrl.format.*; import se.lth.cs.nlp.nlputils.depgraph.*; import se.lth.cs.nlp.nlputils.core.*; import gnu.trove.*; public class CoNLL2008ToLK { private static final String ENCODING = "UTF-8"; public static void main(String[] argv) { String lkTextFile = argv[0]; String lkTokenFile = argv[1]; String conll2008File = argv[2]; String basename = lkTokenFile.replaceAll("\\.tokens\\.xml", ""); String depOutFile = basename + ".depsyntax.xml"; String paOutFile = basename + ".predargs.xml"; try { BufferedReader tInput = new BufferedReader(new FileReader(lkTokenFile)); Scanner conllInput = new Scanner(new FileReader(conll2008File)); PrintWriter depOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(depOutFile), ENCODING)); PrintWriter paOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(paOutFile), ENCODING)); TIntArrayList ids = new TIntArrayList(); Pattern pattern = Pattern.compile("id=\"(.*?)\""); String line = tInput.readLine(); boolean started = false; while(line != null) { line = line.trim(); if(line.equals("")) { // skip } if(!started && line.startsWith(""); depOut.println(""); depOut.println(""); depOut.println(" " + lkTextFile + ""); depOut.println(" LTH-DEP-SRL"); depOut.println(""); depOut.println(""); paOut.println(""); paOut.println(""); paOut.println(""); paOut.println(" " + lkTextFile + ""); paOut.println(" LTH-DEP-SRL"); paOut.println(""); paOut.println(""); int depIndex = 0; TIntIntHashMap predPosToId = new TIntIntHashMap(); int paIdCounter = 0; Triple> t = CoNLL2008Format.readNextGraph(conllInput); while(t != null) { DepGraph dg = t.first; printDepGraph(depOut, dg, depIndex, ids); paIdCounter = printPreds(paOut, t.third, depIndex, ids, paIdCounter, predPosToId); depIndex += dg.nodes.length - 1; depOut.println(); t = CoNLL2008Format.readNextGraph(conllInput); } depOut.println(""); depOut.println(""); depOut.close(); 
// From here: two more lines are written to paOut, then the CoNLL file is reopened and read a
// second time so that printArgs can emit the arguments using the ids stored in predPosToId.
paOut.println(""); paOut.println(""); conllInput = new Scanner(new FileReader(conll2008File)); depIndex = 0; t = CoNLL2008Format.readNextGraph(conllInput); while(t != null) { paIdCounter = printArgs(paOut, lkTokenFile, t.third, depIndex, ids, paIdCounter, predPosToId); depIndex += t.first.nodes.length - 1; t = CoNLL2008Format.readNextGraph(conllInput); } paOut.println(""); paOut.println(""); paOut.close(); } catch(Exception e) { e.printStackTrace(); System.exit(1); } } private static void printDepGraph(PrintWriter out, DepGraph dg, int depIndexStart, TIntArrayList ids) { for(int i = 1; i < dg.nodes.length; i++) { DepNode n = dg.nodes[i]; if(n.parents.length != 1) throw new IllegalArgumentException("Only single-head dependency trees allowed yet"); DepNode p = dg.nodes[i].parents[0]; int childIndex = depIndexStart + n.position - 1; int childTokenId = ids.get(childIndex); if(p.position == 0) { out.println(" " + n.relations[0] + ""); } else { int parentIndex = depIndexStart + p.position - 1; int parentTokenId = ids.get(parentIndex); out.println(" " + n.relations[0] + ""); } } } private static int printPreds(PrintWriter out, List pas, int depIndexStart, TIntArrayList ids, int paIdCounter, TIntIntHashMap predPosToId) { for(PAStructure pa: pas) { int id = ++paIdCounter; int predPosAbsolute = depIndexStart + pa.pred.position - 1; int tokenId = ids.get(predPosAbsolute); predPosToId.put(predPosAbsolute, id); out.println(" " + pa.lemma + ""); } out.println(); return paIdCounter; } private static int printArgs(PrintWriter out, String tokenFileName, List pas, int depIndexStart, TIntArrayList ids, int paIdCounter, TIntIntHashMap predPosToId) { for(PAStructure pa: pas) { int predPosAbsolute = depIndexStart + pa.pred.position - 1; int predId = predPosToId.get(predPosAbsolute); for(int i = 0; i < pa.args.size(); i++) { int id = ++paIdCounter; DepNode arg = pa.args.get(i); String argLabel = pa.argLabels.get(i); int argPosAbsolute = depIndexStart + arg.position - 1; int tokenId = 
ids.get(argPosAbsolute); out.println(" " + argLabel + ""); } } out.println(); return paIdCounter; } }