package lkformat2; import java.io.*; import java.util.*; import java.util.regex.*; import se.lth.cs.nlp.depsrl.format.*; import se.lth.cs.nlp.nlputils.depgraph.*; import se.lth.cs.nlp.nlputils.core.*; import gnu.trove.*;
/**
 * Command-line converter that reads a CoNLL-2008 file plus a token file and
 * writes two UTF-8 output files next to the token file: a ".depsyntax.xml"
 * file (written via {@code printDepGraph}) and a ".predargs.xml" file
 * (written from the lists filled by {@code SRLPostProcess.processPAs}).
 *
 * NOTE(review): this copy of the file looks extraction-damaged. Everything
 * that would sit between angle brackets appears to be missing: the markup
 * inside most string literals (many are now "" or " "), generic type
 * arguments (e.g. "Triple> t", raw ArrayList iterated as String[]), and the
 * part of the token-reading loop that should follow line.startsWith(...).
 * The whole class is also collapsed onto two physical lines, so the
 * end-of-line "skip" comment inside main extends to the end of that line.
 * As shown, the file is not expected to compile - restore it from a clean
 * copy before changing any logic.
 */
public class CoNLL2008ToLK2 { private static final String ENCODING = "UTF-8";
/**
 * Entry point. Expects five arguments:
 * argv[0] lkTextFile (only concatenated into output lines),
 * argv[1] lkTokenFile (read line by line; also used to build argument ids),
 * argv[2] conll2008File (read with CoNLL2008Format.readNextGraph),
 * argv[3] pbDir and argv[4] nbDir (each passed to a LexicalDB constructor).
 *
 * The output base name is lkTokenFile with every ".tokens.xml" match removed;
 * ".depsyntax.xml" and ".predargs.xml" are appended to it. Both outputs are
 * written with the ENCODING charset, while both inputs are opened with
 * FileReader and therefore use the platform default charset.
 *
 * Visible flow: graphs are read until readNextGraph returns null; for each
 * one printDepGraph is called with the running depIndex offset, processPAs
 * appends to predOut/argOut/linkOut, and depIndex advances by
 * dg.nodes.length - 1. Afterwards one line per predOut entry (using ps[2])
 * is written, the argOut section is commented out, and for each linkOut
 * entry eight fields are unpacked; the argument id is "#" + argEvId when
 * argEvId is non-null, otherwise lkTokenFile + "#" + argTokenId.
 *
 * Any exception is printed with printStackTrace and the JVM exits with
 * status 1; the readers and writers are not closed on that path.
 *
 * NOTE(review): the code that fills "ids" from the token file (presumably
 * using the id="..." pattern compiled below) is not present in this copy -
 * confirm against the original source.
 */
public static void main(String[] argv) { String lkTextFile = argv[0]; String lkTokenFile = argv[1]; String conll2008File = argv[2]; String pbDir = argv[3]; String nbDir = argv[4]; String basename = lkTokenFile.replaceAll("\\.tokens\\.xml", ""); String depOutFile = basename + ".depsyntax.xml"; String paOutFile = basename + ".predargs.xml"; try { BufferedReader tInput = new BufferedReader(new FileReader(lkTokenFile)); Scanner conllInput = new Scanner(new FileReader(conll2008File)); PrintWriter depOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(depOutFile), ENCODING)); PrintWriter paOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(paOutFile), ENCODING)); LexicalDB propBank = new LexicalDB(pbDir); LexicalDB nomBank = new LexicalDB(nbDir); TIntArrayList ids = new TIntArrayList(); Pattern pattern = Pattern.compile("id=\"(.*?)\""); String line = tInput.readLine(); boolean started = false; while(line != null) { line = line.trim(); if(line.equals("")) { // skip } if(!started && line.startsWith(""); depOut.println(""); depOut.println(""); depOut.println(" " + lkTextFile + ""); depOut.println(" LTH-DEP-SRL"); depOut.println(""); depOut.println(""); int depIndex = 0; ArrayList predOut = new ArrayList(); ArrayList argOut = new ArrayList(); ArrayList linkOut = new ArrayList(); Triple> t = CoNLL2008Format.readNextGraph(conllInput); while(t != null) { DepGraph dg = t.first; printDepGraph(depOut, dg, depIndex, ids); SRLPostProcess.processPAs(predOut, argOut, linkOut, t.third, dg, propBank, nomBank); depIndex += dg.nodes.length - 1; depOut.println(); t = CoNLL2008Format.readNextGraph(conllInput); } depOut.println(""); depOut.println(""); depOut.close(); 
paOut.println(""); paOut.println(""); paOut.println(""); paOut.println(" " + lkTextFile + ""); paOut.println(" LTH-DEP-SRL"); paOut.println(""); paOut.println(""); for(String[] ps: predOut) { paOut.println(" " + ps[2] + ""); } paOut.println(""); /* paOut.println(""); for(String[] ps: argOut) { paOut.println(" "); } paOut.println(""); */ paOut.println(""); for(String[] ps: linkOut) { String linkId = ps[0]; String predId = ps[1]; String argEvId = ps[2]; String argTokenId = ps[3]; String pbLabel = ps[4]; String vnLabel = ps[5]; String pbDescr = ps[6]; String connective = ps[7]; String argId; if(argEvId != null) argId = "#" + argEvId; else argId = lkTokenFile + "#" + argTokenId; paOut.print(" "); paOut.print(""); } paOut.println(""); paOut.println(""); paOut.close(); } catch(Exception e) { e.printStackTrace(); System.exit(1); } }
/**
 * Writes one output line per node of the graph, skipping index 0.
 *
 * For every node the global token index is depIndexStart + position - 1 and
 * the token id is looked up in ids at that index. When the single parent has
 * position 0 the first branch is taken (presumably the root case - confirm);
 * otherwise the parent's token id is looked up the same way. In both branches
 * only n.relations[0] is visibly concatenated into the printed line.
 *
 * NOTE(review): childTokenId and parentTokenId are computed but never used
 * in the string literals shown here; they were presumably part of markup
 * that is missing from this copy - verify against the original source.
 *
 * @param out            writer that receives the lines
 * @param dg             graph whose nodes[1..] are written
 * @param depIndexStart  offset added to node positions to index into ids
 * @param ids            token ids, indexed by global token position
 * @throws IllegalArgumentException if a node does not have exactly one parent
 */
private static void printDepGraph(PrintWriter out, DepGraph dg, int depIndexStart, TIntArrayList ids) { for(int i = 1; i < dg.nodes.length; i++) { DepNode n = dg.nodes[i]; if(n.parents.length != 1) throw new IllegalArgumentException("Only single-head dependency trees allowed yet"); DepNode p = dg.nodes[i].parents[0]; int childIndex = depIndexStart + n.position - 1; int childTokenId = ids.get(childIndex); if(p.position == 0) { out.println(" " + n.relations[0] + ""); } else { int parentIndex = depIndexStart + p.position - 1; int parentTokenId = ids.get(parentIndex); out.println(" " + n.relations[0] + ""); } } } }