package mpqareader;
import java.util.*;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.io.*;
import se.lth.cs.nlp.depsrl.format.CoNLL2008Format;
import se.lth.cs.nlp.depsrl.format.PAStructure;
import se.lth.cs.nlp.nlputils.annotations.*;
import se.lth.cs.nlp.nlputils.core.ArrayComparator;
import se.lth.cs.nlp.nlputils.core.BinaryOperator;
import se.lth.cs.nlp.nlputils.core.CollectionUtils;
import se.lth.cs.nlp.nlputils.core.Triple;
import se.lth.cs.nlp.nlputils.depgraph.DepGraph;
import se.lth.cs.nlp.nlputils.depgraph.DepNode;
public class MPQAToLK {
// Character encoding passed to every OutputStreamWriter created in processText.
private static final String ENCODING = "UTF-8";
// Running count of processText calls; printed in front of each document name as a progress index.
private static int dirCounter = 0;
/**
 * Writes the output files for one document.
 *
 * Eight files are created in {@code dir}, all named after the document
 * (its "mpqa_file" property with the corpus prefix removed and '/' replaced
 * by '_'): .lktext.xml, .tokens.xml, .pos.xml, .mpqasubjectivity.xml,
 * .subjsen.xml, .depsyntax.xml, .predargs.xml and .sst.xml.
 * The text file is written here; the token, POS, dependency, predicate/argument,
 * SST and subjective-sentence files are filled in by findSentenceSpans, and the
 * subjectivity file by printMPQAAnnotation.
 *
 * NOTE(review): many string literals below are empty or contain only a space,
 * and the property key {@code k} is computed but never printed. This looks like
 * XML markup that was lost from the literals -- confirm against the original
 * file before relying on the output format.
 *
 * NOTE(review): the writers are closed only on the normal path; an exception
 * thrown part-way through leaves the already opened files unclosed.
 *
 * @param dir      output directory (used as a path prefix)
 * @param text     the document to convert; its "mpqa_file" property must be a String
 * @param srlInput source of dependency/SRL graphs, handed to findSentenceSpans
 * @param sstInput source of SST lines, handed to findSentenceSpans
 * @throws IOException if an output file cannot be opened or an input cannot be read
 */
private static void processText(String dir, AnnotatedText text,
Scanner srlInput, BufferedReader sstInput) throws IOException {
dirCounter++;
String docnameFull = (String) text.getProperty("mpqa_file");
// replaceAll takes a regex, so the dots in the prefix match any character.
String docname = docnameFull.replaceAll("database.mpqa.2.0/docs/", "");
System.out.println(dirCounter + ": " + docname);
// Flatten the remaining path separators so the name can be used as a file name.
docname = docname.replaceAll("/", "_");
String textFileName = dir + "/" + docname + ".lktext.xml";
String tokenFileName = dir + "/" + docname + ".tokens.xml";
String posFileName = dir + "/" + docname + ".pos.xml";
String subjFileName = dir + "/" + docname + ".mpqasubjectivity.xml";
String subjSenFileName = dir + "/" + docname + ".subjsen.xml";
String depFileName = dir + "/" + docname + ".depsyntax.xml";
String srlFileName = dir + "/" + docname + ".predargs.xml";
String sstFileName = dir + "/" + docname + ".sst.xml";
// --- text file: header, document properties, then the escaped raw text ---
PrintWriter textOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(textFileName), ENCODING));
textOut.println("");
textOut.println("");
textOut.println("");
textOut.println(" " + docnameFull + "");
for(String k: text.properties.keySet()) {
Object v = text.getProperty(k);
// "mpqa_file" has already been written above.
if(k.equals("mpqa_file"))
continue;
// The "source" property is renamed to "mpqa_source" (k is not used afterwards in the visible code).
if(k.equals("source"))
k = "mpqa_source";
if(v instanceof String) {
textOut.println(" " + v + "");
} else {
if(v instanceof ArrayList) {
// Strip the surrounding '[' and ']' of ArrayList.toString().
String vs = v.toString();
vs = vs.substring(1, vs.length() - 1);
textOut.println(" " + vs + "");
}
}
}
textOut.println("");
textOut.print("");
// Special case for one document: remove every character with code 26 from its text.
if(docname.equals("20020201_20.45.53-22539")) {
String s = "" + (char) 26;
text.text = text.text.replaceAll(s, "");
}
textOut.println(escapeXML(text.text));
textOut.println("");
textOut.println("");
textOut.close();
// --- headers of the seven annotation files; each names the text file and a producer label ---
PrintWriter tokenOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(tokenFileName), ENCODING));
tokenOut.println("");
tokenOut.println("");
tokenOut.println("");
tokenOut.println(" " + docname + ".lktext.xml");
tokenOut.println(" MPQAToLK");
tokenOut.println("");
PrintWriter posOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(posFileName), ENCODING));
posOut.println("");
posOut.println("");
posOut.println("");
posOut.println(" " + docname + ".lktext.xml");
posOut.println(" LTHPOSTagger");
posOut.println("");
PrintWriter subjSenOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(subjSenFileName), ENCODING));
subjSenOut.println("");
subjSenOut.println("");
subjSenOut.println("");
subjSenOut.println(" " + docname + ".lktext.xml");
subjSenOut.println(" MPQAToLK");
subjSenOut.println("");
PrintWriter subjOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(subjFileName), ENCODING));
subjOut.println("");
subjOut.println("");
subjOut.println("");
subjOut.println(" " + docname + ".lktext.xml");
subjOut.println(" MPQAToLK");
subjOut.println("");
PrintWriter depOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(depFileName), ENCODING));
depOut.println("");
depOut.println("");
depOut.println("");
depOut.println(" " + docname + ".lktext.xml");
depOut.println(" LTH-DEP-SRL");
depOut.println("");
PrintWriter srlOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(srlFileName), ENCODING));
srlOut.println("");
srlOut.println("");
srlOut.println("");
srlOut.println(" " + docname + ".lktext.xml");
srlOut.println(" LTH-DEP-SRL");
srlOut.println("");
PrintWriter sstOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(sstFileName), ENCODING));
sstOut.println("");
sstOut.println("");
sstOut.println("");
sstOut.println(" " + docname + ".lktext.xml");
sstOut.println(" SSTLight");
sstOut.println("");
// Fills every file except the text and subjectivity files; the returned count (idc) is not used afterwards.
int idc = findSentenceSpans(text, docname, tokenOut, subjSenOut, srlInput, sstInput, posOut, depOut, srlOut, sstOut);
tokenOut.println("");
tokenOut.close();
posOut.println("");
posOut.close();
depOut.println("");
depOut.close();
srlOut.println("");
srlOut.close();
sstOut.println("");
sstOut.close();
subjSenOut.println("");
subjSenOut.close();
// The MPQA annotation ids start from 0 again (idCounter argument), independent of the token ids.
printMPQAAnnotation(text, docname, subjOut, 0);
// subjSenOut was already closed above; this second close is redundant.
subjSenOut.close();
subjOut.close();
}
// Comparator over the MPQA intensity labels, built from the list "extreme" ... "none".
// classifySubjSen passes it to CollectionUtils.min to combine intensities.
// NOTE(review): presumably ArrayComparator orders values by their position in the array,
// so that "extreme" is the minimum -- confirm against ArrayComparator.
private static final Comparator INTENSITY_COMP
= new ArrayComparator(new String[] {
"extreme", "high", "medium", "low", "none"
});
/**
 * Merges two polarity labels into one.
 *
 * Rules, in order:
 *   - a null operand yields the other operand; equal operands yield themselves;
 *   - if exactly one operand starts with "unc", the other (certain) one wins;
 *   - otherwise both share the same certainty: "both" on either side, or a
 *     positive/negative pair, gives "both"; "neutral" gives way to the other
 *     label; the result carries the "uncertain-" prefix when the operands did.
 * Any remaining combination raises a RuntimeException.
 */
private static final BinaryOperator POLARITY_OP
= new BinaryOperator() {
    public String apply(String t1, String t2) {
        if (t1 == null) {
            return t2;
        }
        if (t2 == null) {
            return t1;
        }
        if (t1.equals(t2)) {
            return t1;
        }

        final boolean firstUncertain = t1.startsWith("unc");
        final boolean secondUncertain = t2.startsWith("unc");
        if (firstUncertain != secondUncertain) {
            // Mixed certainty: keep the label that is not marked uncertain.
            return firstUncertain ? t2 : t1;
        }

        // From here on both labels have the same certainty.
        final boolean uncertain = firstUncertain;
        String left = t1;
        String right = t2;
        if (uncertain) {
            left = t1.substring("uncertain-".length());
            right = t2.substring("uncertain-".length());
        }

        final boolean opposite =
            (left.equals("positive") && right.equals("negative"))
            || (right.equals("positive") && left.equals("negative"));
        if (left.equals("both") || right.equals("both") || opposite) {
            return uncertain ? "uncertain-both" : "both";
        }
        if (left.equals("neutral")) {
            return uncertain ? ("uncertain-" + right) : right;
        }
        if (right.equals("neutral")) {
            return uncertain ? ("uncertain-" + left) : left;
        }
        throw new RuntimeException("unhandled: t1 = " + left + ", t2 = " + right);
    }
};
/**
 * Summarises the subjectivity annotations that lie inside one sentence.
 *
 * Layers 1 and up of {@code text} are scanned for "GATE_direct-subjective" and
 * "GATE_expressive-subjectivity" spans fully contained in {@code sen} that have
 * no "insubstantial" property. For each, the intensity (missing, empty or
 * "neutral" counts as "low") is folded into a per-polarity value via
 * CollectionUtils.min with INTENSITY_COMP. A missing or empty polarity counts
 * as "neutral"; a leading "uncertain-" is ignored; "both" feeds the positive
 * and the negative value.
 *
 * @return {positive, neutral, negative} intensities, where a neutral value of
 *         "low" is reported as "none"; or null when all three are "none"
 */
static String[] classifySubjSen(Span sen, AnnotatedText text) {
    /*
     * FROM THE MPQA DOCUMENTATION *
     A sentence was considered subjective if 1 OR 2:
     1. the sentence contains a "GATE_direct-subjective"
     annotation WITH attribute intensity NOT IN ['low', 'neutral']
     AND NOT WITH attribute insubstantial.
     2. the sentence contains a "GATE_expressive-subjectivity"
     annotation WITH attribute intensity NOT IN ['low']
     Otherwise, a sentence was considered objective.
     */
    String positive = "none";
    String negative = "none";
    String neutral = "none";

    for (int layerIx = 1; layerIx < text.layers.size(); layerIx++) {
        for (Span ann : (AnnotationLayer) text.layers.get(layerIx)) {
            // Only annotations completely inside the sentence count.
            if (ann.start < sen.start || ann.end > sen.end) {
                continue;
            }
            boolean subjective = ann.label.equals("GATE_direct-subjective")
                || ann.label.equals("GATE_expressive-subjectivity");
            if (!subjective) {
                continue;
            }
            if (ann.getProperty("insubstantial") != null) {
                continue;
            }

            String intens = (String) ann.getProperty("intensity");
            intens = (intens == null) ? "low" : intens.trim();
            if (intens.equals("") || intens.equals("neutral")) {
                intens = "low";
            }

            String pol = (String) ann.getProperty("polarity");
            pol = (pol == null) ? "neutral" : pol.trim();
            if (pol.equals("")) {
                pol = "neutral";
            }
            if (pol.startsWith("uncertain-")) {
                pol = pol.substring("uncertain-".length());
            }

            final boolean both = pol.equals("both");
            if (pol.equals("neutral")) {
                neutral = CollectionUtils.min(intens, neutral, INTENSITY_COMP);
            }
            if (both || pol.equals("negative")) {
                negative = CollectionUtils.min(intens, negative, INTENSITY_COMP);
            }
            if (both || pol.equals("positive")) {
                positive = CollectionUtils.min(intens, positive, INTENSITY_COMP);
            }
        }
    }

    // A merely "low" neutral intensity is not reported.
    if (neutral.equals("low")) {
        neutral = "none";
    }
    boolean nothingFound = positive.equals("none")
        && negative.equals("none")
        && neutral.equals("none");
    if (nothingFound) {
        return null;
    }
    return new String[] { positive, neutral, negative };
}
// Global sentence counters printed by main: sen0 counts every sentence span visited by
// findSentenceSpans, sen1 only those that get past its skip checks.
private static int sen0, sen1;
/**
 * Tells whether a token contains an angle bracket ('<' or '>') anywhere,
 * which this class treats as the mark of an XML tag.
 */
private static boolean isXMLTag(String s) {
    if (s.indexOf('<') >= 0) {
        return true;
    }
    return s.indexOf('>') >= 0;
}
// Matches any string that contains at least one ASCII letter (not a string of exactly one letter).
private static final Pattern ONE_LETTER = Pattern.compile(".*[A-Za-z].*");
/**
 * Writes the token list and, sentence by sentence, the sentence, subjective-sentence,
 * dependency, POS/lemma, predicate/argument and SST output for one document.
 *
 * For every sentence span in layer 0 that is not skipped, one graph is read from
 * {@code srlInput} and one line from {@code sstInput}, so both inputs must contain
 * exactly the sentences that survive the skip checks, in the same order.
 * Dependency lines are written immediately; POS, lemma, predicate, argument and the
 * three SST columns are collected in StringBuilders and written after the loop.
 *
 * NOTE(review): several literals below are empty and some declarations lack their
 * generic type arguments (e.g. "Triple> tr"); this looks like text lost from the
 * file -- confirm against the original before compiling.
 *
 * @param baseName not used in the visible code
 * @return the number of sentences for which a subjective-sentence entry was written
 * @throws IOException if reading from sstInput fails
 */
private static int findSentenceSpans(AnnotatedText text,
String baseName,
PrintWriter tokenOut, PrintWriter subjSenOut,
Scanner srlInput, BufferedReader sstInput,
PrintWriter posOut,
PrintWriter depOut,
PrintWriter srlOut,
PrintWriter sstOut) throws IOException {
int[] tokenStarts = (int[]) text.getProperty("token-starts");
int[] tokenEnds = (int[]) text.getProperty("token-ends");
tokenOut.println("");
// One entity per token; ids are 1-based and the end offset is passed as tokenEnds[i] - 1.
for(int i = 0; i < text.tokens.length; i++) {
printEntity(text.tokens[i], tokenStarts[i], tokenEnds[i] - 1, i + 1, tokenOut);
}
tokenOut.println("");
// Sentence ids continue after the token ids.
int idCounter = text.tokens.length;
int subjIdCounter = 0;
tokenOut.println("");
subjSenOut.println("");
depOut.println("");
//srlOut.println("");
// Buffers for output that is written in one piece after all sentences were processed.
StringBuilder sbP = new StringBuilder();
StringBuilder sbA = new StringBuilder();
StringBuilder sb1 = new StringBuilder();
StringBuilder sb2 = new StringBuilder();
StringBuilder sb3 = new StringBuilder();
StringBuilder sbPOS = new StringBuilder();
StringBuilder sbLemma = new StringBuilder();
int paIdCounter = 0;
// depIdCounter, posIdCounter and neIdCounter are never read in the visible code.
int depIdCounter = 0;
int posIdCounter = 0;
int neIdCounter = 0;
//ArrayList>> conllGraphs = new ArrayList();
Span prevSen = null;
// Disabled debugging hook (the condition starts with "false &&").
if(false && text.getProperty("mpqa_file").equals("database.mpqa.2.0/docs/ula/Article247_66")) {
System.out.println(text.layers.get(0).spans);
System.exit(0);
}
for(Span sen: (AnnotationLayer) text.layers.get(0)) {
sen0++;
int start = sen.tokenStart;
int end = sen.tokenEnd;
// Sentences starting with the tokens "AR" ":" are dropped entirely.
if(end - start > 2 && text.tokens[start].equals("AR") && text.tokens[start+1].equals(":")) {
System.out.println("skipped arabic sentence");
continue;
}
// For sentences starting with "EN" ":" only the two marker tokens are dropped.
if(end - start > 2 && text.tokens[start].equals("EN") && text.tokens[start+1].equals(":")) {
start += 2;
}
// Skip leading tokens that contain '<' or '>'.
while(start < end && start < text.tokens.length && isXMLTag(text.tokens[start]))
start++;
if(start >= end)
continue;
boolean sawWord = false;
for(int i = start; i < end; i++)
if(ONE_LETTER.matcher(text.tokens[i]).matches()) {
sawWord = true;
break;
}
// NOTE(review): this only logs; there is no "continue", so the sentence is still
// processed below despite the message -- confirm whether that is intended.
if(!sawWord) {
System.err.println("Skipped this sentence:");
System.err.println("|" + text.text.substring(sen.start, sen.end) + "|");
}
// Sanity check: sentences must not overlap the previous one, in characters or in tokens.
if(prevSen != null) {
if(sen.start < prevSen.end)
throw new RuntimeException("sentences not ordered");
if(sen.tokenStart < prevSen.tokenEnd)
throw new RuntimeException("sentences not ordered");
/*if(text.getProperty("mpqa_file").equals("database.mpqa.2.0/docs/ula/Article247_66")) {
if(sen.end == 894) {
System.out.println(sen);
System.out.prin
}
}*/
}
prevSen = sen;
sen1++;
idCounter++;
tokenOut.print(" ");
String[] ssp = classifySubjSen(sen, text);
if(ssp != null) {
String posi = ssp[0], neui = ssp[1], negi = ssp[2];
// Subjective if any polarity has an intensity, where a neutral "low" does not count.
if(!posi.equals("none") || !negi.equals("none")
|| !neui.matches("none|low")) {
subjIdCounter++;
subjSenOut.print(" ");
//subjSenOut.print("");
//subjSenOut.print("");
subjSenOut.print("");
subjSenOut.println("");
}
}
// Next parsed sentence: tr.first is the dependency graph, tr.third is passed to printPreds/printArgs.
Triple> tr = CoNLL2008Format.readNextGraph(srlInput);
//System.out.println("Read graph: " + tr.first);
DepGraph dg = tr.first;
fixLemmas(dg);
// Align graph nodes (from index 1) with document tokens by scanning forward for an equal word form.
// There is no bounds check: a word that never matches runs past the end of text.tokens.
HashMap tokenIndices = new HashMap();
int tokenIndex = start;
for(int i = 1; i < dg.nodes.length; i++) {
DepNode n = dg.nodes[i];
while(!n.word.equals(text.tokens[tokenIndex]))
tokenIndex++;
tokenIndices.put(n, tokenIndex);
tokenIndex++;
}
for(int i = 1; i < dg.nodes.length; i++) {
DepNode n = dg.nodes[i];
// Token ids in the output are 1-based.
int childTokenId = 1 + tokenIndices.get(n);
sbPOS.append(" " + n.pos + "\n");
if(n.lemma != null && !n.lemma.equals("_"))
sbLemma.append(" " + escapeXML(n.lemma) + "\n");
if(n.parents.length != 1)
throw new IllegalArgumentException("Only single-head dependency trees allowed yet");
DepNode p = dg.nodes[i].parents[0];
// A parent at position 0 is handled separately from a parent that is a real token.
if(p.position == 0) {
depOut.println(" " + n.relations[0] + "");
} else {
int parentTokenId = 1 + tokenIndices.get(p);
depOut.println(" " + n.relations[0] + "");
}
tokenIndex++;
}
paIdCounter = printPreds(sbP, tr.third, tokenIndices, paIdCounter);
paIdCounter = printArgs(sbA, tr.third, tokenIndices, paIdCounter);
if(true) {
// One SST line per sentence: space-separated fields, six per SST token.
String line = sstInput.readLine();
String[] ts = line.split(" ");
if(ts.length % 6 != 0)
throw new RuntimeException("illegal number of tokens");
// compress merges SST tokens so that there is one row of three tags per graph node.
ArrayList l = compress(ts, tr.first);
ArrayList col1 = iobToSpans(l, 0, tokenIndices, tr.first);
ArrayList col2 = iobToSpans(l, 1, tokenIndices, tr.first);
ArrayList col3 = iobToSpans(l, 2, tokenIndices, tr.first);
for(Span s: col1)
sb1.append("" + s.label + "\n");
for(Span s: col2)
sb2.append("" + s.label + "\n");
for(Span s: col3)
sb3.append("" + s.label + "\n");
}
}
// Close the per-sentence sections and flush the buffered sections to their files.
tokenOut.println("");
subjSenOut.println("");
depOut.println("");
posOut.println("");
posOut.print(sbPOS);
posOut.println("");
posOut.println("");
posOut.print(sbLemma);
posOut.println("");
srlOut.println("");
srlOut.print(sbP);
srlOut.println("");
srlOut.println("");
srlOut.print(sbA);
srlOut.println("");
sstOut.println("");
sstOut.print(sb1);
sstOut.println("");
sstOut.println("");
sstOut.print(sb2);
sstOut.println("");
sstOut.println("");
sstOut.print(sb3);
sstOut.println("");
return subjIdCounter;
}
private static void fixLemmas(DepGraph dg) {
for(int i = 1; i < dg.nodes.length; i++) {
if(dg.nodes[i].lemma != null && !dg.nodes[i].lemma.equals("_"))
dg.nodes[i].lemma = dg.nodes[i].lemma.toLowerCase();
else {
if(dg.nodes[i].pos.matches("UH|EX|WDT|WRB|WP|WP\\$|DT|IN|TO|MD|PRP|VB|NN|JJ|CC|PDT|FW|NNP|NNPS|RB|RP|CD|\\.|\\,|\\#|\\$|:|\\(|\\)"))
dg.nodes[i].lemma = dg.nodes[i].word.toLowerCase();
// holes in the original lemma lexicon
else if(dg.nodes[i].word.toLowerCase().equals("biased"))
dg.nodes[i].lemma = "bias";
else if(dg.nodes[i].word.toLowerCase().equals("aced"))
dg.nodes[i].lemma = "ace";
else if(dg.nodes[i].word.toLowerCase().equals("barreled"))
dg.nodes[i].lemma = "barrel";
else if(dg.nodes[i].word.toLowerCase().equals("bogged"))
dg.nodes[i].lemma = "bog";
else if(dg.nodes[i].word.toLowerCase().equals("bruised"))
dg.nodes[i].lemma = "bruise";
else if(dg.nodes[i].word.toLowerCase().equals("criss-crossed"))
dg.nodes[i].lemma = "criss-cross";
else if(dg.nodes[i].word.toLowerCase().equals("delegated"))
dg.nodes[i].lemma = "delegate";
else if(dg.nodes[i].word.toLowerCase().equals("delisted"))
dg.nodes[i].lemma = "delist";
else if(dg.nodes[i].word.toLowerCase().equals("disguised"))
dg.nodes[i].lemma = "disguise";
else if(dg.nodes[i].word.toLowerCase().equals("downsized"))
dg.nodes[i].lemma = "downsize";
else if(dg.nodes[i].word.toLowerCase().equals("evacuated"))
dg.nodes[i].lemma = "evacuate";
else if(dg.nodes[i].word.toLowerCase().equals("evaluated"))
dg.nodes[i].lemma = "evaluate";
else if(dg.nodes[i].word.toLowerCase().equals("faxed"))
dg.nodes[i].lemma = "fax";
else if(dg.nodes[i].word.toLowerCase().equals("emailed"))
dg.nodes[i].lemma = "email";
else if(dg.nodes[i].word.toLowerCase().equals("graduated"))
dg.nodes[i].lemma = "graduate";
else if(dg.nodes[i].word.toLowerCase().equals("guided"))
dg.nodes[i].lemma = "guide";
else if(dg.nodes[i].word.toLowerCase().equals("headquartered"))
dg.nodes[i].lemma = "headquarter";
else if(dg.nodes[i].word.toLowerCase().equals("inched"))
dg.nodes[i].lemma = "inch";
else if(dg.nodes[i].word.toLowerCase().equals("influenced"))
dg.nodes[i].lemma = "influence";
else if(dg.nodes[i].word.toLowerCase().equals("occured"))
dg.nodes[i].lemma = "occur";
else if(dg.nodes[i].word.toLowerCase().equals("outbade"))
dg.nodes[i].lemma = "outbid";
else if(dg.nodes[i].word.toLowerCase().equals("overpriced"))
dg.nodes[i].lemma = "overprice";
else if(dg.nodes[i].word.toLowerCase().equals("prepped"))
dg.nodes[i].lemma = "prep";
else if(dg.nodes[i].word.toLowerCase().equals("quoted"))
dg.nodes[i].lemma = "quote";
else if(dg.nodes[i].word.toLowerCase().equals("radicalized"))
dg.nodes[i].lemma = "radicalize";
else if(dg.nodes[i].word.toLowerCase().equals("readmitted"))
dg.nodes[i].lemma = "readmit";
else if(dg.nodes[i].word.toLowerCase().equals("redefined"))
dg.nodes[i].lemma = "redefine";
else if(dg.nodes[i].word.toLowerCase().equals("reinstalled"))
dg.nodes[i].lemma = "reinstall";
else if(dg.nodes[i].word.toLowerCase().equals("reloaded"))
dg.nodes[i].lemma = "reload";
else if(dg.nodes[i].word.toLowerCase().equals("re-occupied"))
dg.nodes[i].lemma = "re-occupy";
else if(dg.nodes[i].word.toLowerCase().equals("situated"))
dg.nodes[i].lemma = "situated";
else if(dg.nodes[i].word.toLowerCase().equals("self-inflicted"))
dg.nodes[i].lemma = "self-inflict";
else if(dg.nodes[i].word.toLowerCase().equals("sourced"))
dg.nodes[i].lemma = "source";
else if(dg.nodes[i].word.toLowerCase().equals("spirited"))
dg.nodes[i].lemma = "spirit";
else if(dg.nodes[i].word.toLowerCase().equals("spotted"))
dg.nodes[i].lemma = "spot";
else if(dg.nodes[i].word.toLowerCase().equals("unexplored"))
dg.nodes[i].lemma = "unexplore";
else if(dg.nodes[i].word.toLowerCase().equals("wounded"))
dg.nodes[i].lemma = "wound";
}
}
}
/**
 * Debugging aid: prints the first three columns of every row, tab-separated,
 * to standard output and then terminates the JVM with status 0.
 */
private static void print(ArrayList l) {
    for (int row = 0; row < l.size(); row++) {
        String[] cols = (String[]) l.get(row);
        System.out.println(cols[0] + "\t" + cols[1] + "\t" + cols[2]);
    }
    System.exit(0);
}
/**
 * Turns one column of IOB tags into labelled token spans.
 *
 * Row i of {@code ss} belongs to graph node i + 1. A span is opened by a "B-"
 * or "I-" tag when none is open, and closed by "0", by any "B-" tag, or by an
 * "I-" tag whose label differs from the open span. Span boundaries are the
 * 1-based document token ids looked up in {@code tokenIndices}; a span still
 * open after the last row ends at the last node of the graph.
 *
 * @param ss           rows of tags, one String[] per graph node
 * @param col          index of the tag column to read
 * @param tokenIndices maps each graph node to its 0-based document token index
 * @return the spans in the order in which they were closed
 */
private static ArrayList iobToSpans(ArrayList ss, int col, HashMap tokenIndices, DepGraph depGraph) {
    // tokenOf[k] = document token index of graph node k + 1
    int[] tokenOf = new int[depGraph.nodes.length - 1];
    for (int k = 0; k < tokenOf.length; k++) {
        tokenOf[k] = (Integer) tokenIndices.get(depGraph.nodes[k + 1]);
    }

    ArrayList spans = new ArrayList();
    Span open = null;
    for (int row = 0; row < ss.size(); row++) {
        String tag = ((String[]) ss.get(row))[col];
        boolean begins = tag.startsWith("B-");
        boolean continues = tag.startsWith("I-");

        if (open != null) {
            boolean closes = tag.equals("0")
                || begins
                || (continues && !open.label.equals(tag.substring(2)));
            if (closes) {
                // The span ends on the previous row.
                open.tokenEnd = 1 + tokenOf[row - 1];
                spans.add(open);
                open = null;
            }
        }

        if (open == null && (continues || begins)) {
            open = new Span();
            open.label = tag.substring(2);
            open.tokenStart = 1 + tokenOf[row];
        }
    }

    if (open != null) {
        open.tokenEnd = 1 + tokenOf[tokenOf.length - 1];
        spans.add(open);
    }
    return spans;
}
/**
 * Returns the first value in column {@code field} of the SST tokens
 * {@code from..to} (inclusive) that starts with {@code prefix}, or null.
 */
private static String firstTagWithPrefix(String[] ss, int from, int to, int field, String prefix) {
    for (int tok = from; tok <= to; tok++) {
        String candidate = ss[6 * tok + field];
        if (candidate.startsWith(prefix)) {
            return candidate;
        }
    }
    return null;
}

/**
 * Regroups a flat SST line (six fields per SST token) into one row of three
 * tags per dependency-graph node.
 *
 * SST tokens are concatenated until their combined length reaches the length of
 * the node's word form; overshooting is an error. For each of the fields 3, 4
 * and 5 the row takes the first "B-" tag of the merged tokens, else the first
 * "I-" tag, else "0".
 *
 * @param ss flat array of fields; field 0 of each group of six is the token text
 * @param dg graph whose nodes 1..n-1 must be covered exactly
 * @return list of String[3] rows, one per graph node
 * @throws RuntimeException if the token texts overshoot a word form, or if graph
 *         nodes are left over ("nodes left")
 */
private static ArrayList compress(String[] ss, DepGraph dg) {
    ArrayList rows = new ArrayList();
    final int sstTokenCount = ss.length / 6;
    int node = 1;
    int sstTok = 0;
    while (sstTok < sstTokenCount) {
        final int firstTok = sstTok;
        String joined = ss[6 * sstTok];
        String word = dg.nodes[node].word;
        // Absorb further SST tokens while the text is still shorter than the word form.
        while (joined.length() < word.length()) {
            sstTok++;
            joined = joined + ss[6 * sstTok];
        }
        if (joined.length() > word.length()) {
            throw new RuntimeException("s1 = " + joined + ", s2 = " + word);
        }

        String[] row = new String[3];
        for (int c = 0; c < row.length; c++) {
            String tag = firstTagWithPrefix(ss, firstTok, sstTok, 3 + c, "B-");
            if (tag == null) {
                tag = firstTagWithPrefix(ss, firstTok, sstTok, 3 + c, "I-");
            }
            row[c] = (tag == null) ? "0" : tag;
        }
        rows.add(row);

        sstTok++;
        node++;
    }
    if (node != dg.nodes.length) {
        throw new RuntimeException("nodes left");
    }
    return rows;
}
/**
 * Appends one line per predicate of the sentence to {@code out}, followed by an empty line.
 *
 * NOTE(review): id and tokenId are computed but do not appear in the appended text,
 * which suggests the literal lost its markup -- confirm against the original file.
 *
 * @param pas          the predicate-argument structures of the sentence
 * @param tokenIndices maps graph nodes to 0-based document token indices
 * @param paIdCounter  last id used so far
 * @return paIdCounter increased by the number of predicates
 */
private static int printPreds(StringBuilder out, List pas,
HashMap tokenIndices, int paIdCounter) {
for(PAStructure pa: pas) {
int id = ++paIdCounter;
// 1-based token id of the predicate word.
int tokenId = 1 + tokenIndices.get(pa.pred);
out.append(" " + pa.lemma + "\n");
}
out.append("\n");
return paIdCounter;
}
/**
 * Appends one line per argument of every predicate to {@code out}, followed by an empty line.
 *
 * NOTE(review): id, predTokenId and argTokenId are computed but do not appear in the
 * appended text, which suggests the literal lost its markup -- confirm against the
 * original file.
 *
 * @param pas          the predicate-argument structures of the sentence
 * @param tokenIndices maps graph nodes to 0-based document token indices
 * @param paIdCounter  last id used so far
 * @return paIdCounter increased by the total number of arguments
 */
private static int printArgs(StringBuilder out,
List pas,
HashMap tokenIndices, int paIdCounter) {
for(PAStructure pa: pas) {
// 1-based token id of the predicate word.
int predTokenId = 1 + tokenIndices.get(pa.pred);
// pa.args and pa.argLabels are read in parallel, index by index.
for(int i = 0; i < pa.args.size(); i++) {
int id = ++paIdCounter;
DepNode arg = pa.args.get(i);
String argLabel = pa.argLabels.get(i);
int argTokenId = 1 + tokenIndices.get(arg);
//out.println(" "
// + argLabel + "");
out.append(" "
+ argLabel + "\n");
}
}
out.append("\n");
return paIdCounter;
}
/**
 * Prints one entity line for a token: some attributes, then the XML-escaped
 * label {@code l}.
 *
 * NOTE(review): the body is incomplete as it stands -- the "else" below has no
 * matching "if", and id and end are never used. Part of the method appears to be
 * missing; restore it from the original file before compiling.
 *
 * @param l     text of the entity; escaped with escapeXML before printing
 * @param start start offset, printed after "on=\"#"
 * @param end   end offset (findSentenceSpans passes tokenEnds[i] - 1)
 * @param id    entity id (findSentenceSpans passes the 1-based token number)
 * @param out   destination
 */
static void printEntity(String l, int start, int end,
int id, PrintWriter out) {
StringBuilder sb = new StringBuilder(" ");
else
sb.append("\" on=\"#" + start + "\">");
sb.append(escapeXML(l));
sb.append("");
out.println(sb);
}
// end of copied code (was Swedish: "slut saxat")
/**
 * Writes the MPQA annotations of one document to {@code subjOut}.
 *
 * Steps, in order:
 *  1. clean-up: implicit spans are collapsed to zero length; zero-length spans are
 *     removed unless they are GATE_agent spans with id "w" or "implicit";
 *  2. document-specific repairs of known annotation errors, selected by baseName;
 *  3. a first pass that numbers all GATE_agent spans and records their MPQA ids in
 *     midToLKid, so that references to agents can be resolved regardless of order;
 *  4. one output section each for agents, expressive-subjectivity, objective speech
 *     events, targets, attitudes and direct-subjective annotations. Targets and
 *     attitudes also register their ids in midToLKid while being written.
 *
 * NOTE(review): several literals below are empty or cut off (for example
 * {@code subjOut.print(" = 0) {}), and the braces no longer balance; text appears
 * to have been lost from the file -- restore it from the original before compiling.
 *
 * @param text      the document; its annotation layers are modified in place
 * @param baseName  document name, used to select the document-specific repairs
 * @param subjOut   destination; flushed at the end but not closed
 * @param idCounter last id used so far; ids written here continue from it
 */
private static void printMPQAAnnotation(AnnotatedText text,
String baseName,
PrintWriter subjOut,
int idCounter) {
// Debugging switch, currently always false (the condition starts with "false &&").
final boolean debug;
if(false && baseName.equals("20020306_15.02.54-18922")) {
debug = true;
} else
debug = false;
// Step 1: normalise implicit spans and drop unwanted zero-length spans.
for(AnnotationLayer l: text.layers) {
for(Iterator it = l.iterator(); it.hasNext(); ) {
Span s = it.next();
if(s.hasProperty("implicit") && s.getProperty("implicit").equals("true")) {
s.end = s.start;
s.tokenEnd = s.tokenStart;
continue;
}
// Spans with real extent are always kept.
if(s.start < s.end)
continue;
if(!s.label.equals("GATE_agent")) {
it.remove();
continue;
}
// Zero-length agents survive only with id "w" or "implicit".
String id = (String) s.getProperty("id");
if(id != null) {
id = id.trim();
if(id != null && id.equals("w"))
continue;
if(id != null && id.equals("implicit"))
continue;
}
it.remove();
// TODO should we really remove all empty agents?
}
}
// Step 2: repairs for individual documents.
if(baseName.equals("20020131_20.58.51-26741")) {
// bug: insubstantial and nested-source confused on one item
for(AnnotationLayer l: text.layers)
for(Span s: l) {
String ns = (String) s.getProperty("nested-source");
if(ns != null && ns.equals("c2")) {
// Swap the two property values; only the first match in each layer is fixed.
s.setProperty("nested-source", s.getProperty("insubstantial"));
s.setProperty("insubstantial", ns);
break;
}
}
} else if(baseName.equals("xbank_wsj_0610")) {
// bug: missing w, implicit
Span w = new Span();
w.start = w.end = 0;
w.label = "GATE_agent";
w.setProperty("id", "w");
text.layers.get(1).add(w);
Span imp = new Span();
// NOTE(review): this assigns w.end, not imp.end -- looks like a copy-paste slip;
// imp.end keeps whatever value a new Span starts with. Confirm.
imp.start = w.end = 0;
imp.label = "GATE_agent";
imp.setProperty("id", "implicit");
text.layers.get(1).add(imp);
} else if(baseName.equals("xbank_wsj_0122")) {
// Remove the attitude-link value "agreement" in this document.
for(AnnotationLayer l: text.layers)
for(Span s: l) {
String al = (String) s.getProperty("attitude-link");
if(al != null && al.matches("agreement"))
s.properties.remove("attitude-link");
}
} else if(baseName.equals("xbank_wsj_0557")) {
// Remove the attitude-link value "expsale" in this document.
for(AnnotationLayer l: text.layers)
for(Span s: l) {
String al = (String) s.getProperty("attitude-link");
if(al != null && al.matches("expsale"))
s.properties.remove("attitude-link");
}
} else if(baseName.equals("xbank_wsj_0376")) {
// Remove the attitude-link value "ew" in this document.
for(AnnotationLayer l: text.layers)
for(Span s: l) {
String al = (String) s.getProperty("attitude-link");
if(al != null && al.matches("ew"))
s.properties.remove("attitude-link");
}
} else if(baseName.equals("xbank_wsj_0187")) {
// Remove the attitude-link value "soffer" in this document.
for(AnnotationLayer l: text.layers)
for(Span s: l) {
String al = (String) s.getProperty("attitude-link");
if(al != null && al.matches("soffer"))
s.properties.remove("attitude-link");
}
}
// midToLKid: trimmed MPQA id -> id used in the output; spanToLKid: agent span -> output id.
HashMap midToLKid = new HashMap();
HashMap spanToLKid = new HashMap();
/* Agents may reference forward or self -- we need a first pass. */
for(AnnotationLayer l: text.layers) {
// Every layer is sorted in place before ids are handed out.
Collections.sort(l.spans, Span.ByLeftOrder.instance());
for(Span s: l) {
if(s.label.equals("GATE_agent")) {
idCounter++;
spanToLKid.put(s, "" + idCounter);
String id = (String) s.getProperty("id");
if(id != null) {
id = id.trim();
midToLKid.put(id, "" + idCounter);
}
}
}
}
// Section: agents.
subjOut.println("");
for(AnnotationLayer l: text.layers) {
for(Span s: l) {
if(s.label.equals("GATE_agent")) {
//if(debug)
// System.out.println(s);
//idCounter++;
//subjOut.print(" = 0) {
// s.tokenStart > 0 needed TODO check
// bug in 20020427_22.07.25-26605
subjOut.print(" start=\"#" + (s.tokenStart+1) + "\"");
subjOut.print(" end=\"#" + s.tokenEnd + "\"");
}
subjOut.print(">");
// todo handle ns even if the agent comes later
subjOut.print("");
subjOut.print("");
if(false) {
if(s.start < s.end) {
String t = text.text.substring(s.start, s.end);
subjOut.print(" ");
}
subjOut.print(" ");
}
subjOut.println();
}
}
}
// Aliases and hard-coded ids for agent ids that are misspelled or missing in individual documents.
if(baseName.equals("20010715_00.31.31-4544")) {
// bug: mteam -> team
String id = midToLKid.get("team");
midToLKid.put("mteam",id);
} else if(baseName.equals("20020509_22.11.01-7259")) {
// bug: devcon -> devcoun
String id = midToLKid.get("devcoun");
midToLKid.put("devcon",id);
} else if(baseName.equals("temp_fbis_20.45.06-5529")) {
// bug: ungovint -> usgovint
String id = midToLKid.get("usgovint");
midToLKid.put("ungovint",id);
} else if(baseName.equals("xbank_wsj_0610")) {
// bug: wel missing
midToLKid.put("wel", "24");
} else if(baseName.equals("xbank_wsj_0376")) {
// bug: ana -> analysts
String id = midToLKid.get("analysts");
midToLKid.put("ana",id);
} else if(baseName.equals("xbank_wsj_0778")) {
// bug: bdl missing
midToLKid.put("bdl", "40");
}
subjOut.println("");
subjOut.print(" ");
subjOut.println("");
// Section: expressive-subjectivity annotations.
for(AnnotationLayer l: text.layers) {
for(Span s: l) {
if(s.label.equals("GATE_expressive-subjectivity")) {
idCounter++;
subjOut.print(" = 0) {
subjOut.print(" start=\"#" + (s.tokenStart+1) + "\"");
subjOut.print(" end=\"#" + s.tokenEnd + "\"");
}
subjOut.print(">");
subjOut.print("");
subjOut.print("");
if(false) {
if(s.start < s.end) {
String t = text.text.substring(s.start, s.end);
subjOut.print(" ");
}
}
subjOut.println();
}
}
}
subjOut.println("");
subjOut.println("");
// Section: objective speech events.
for(AnnotationLayer l: text.layers) {
for(Span s: l) {
if(s.label.equals("GATE_objective-speech-event")) {
idCounter++;
subjOut.print(" = 0) {
subjOut.print(" start=\"#" + (s.tokenStart+1) + "\"");
subjOut.print(" end=\"#" + s.tokenEnd + "\"");
}
subjOut.print(">");
subjOut.print("");
subjOut.print("");
if(false) {
if(s.start < s.end) {
String t = text.text.substring(s.start, s.end);
subjOut.print(" ");
}
subjOut.print(" ");
}
subjOut.println();
}
}
}
subjOut.println("");
subjOut.println("");
// Section: targets; their ids are registered (untrimmed) so later sections can refer to them.
for(AnnotationLayer l: text.layers) {
for(Span s: l) {
if(s.label.equals("GATE_target")) {
idCounter++;
subjOut.print(" = 0) {
subjOut.print(" start=\"#" + (s.tokenStart+1) + "\"");
subjOut.print(" end=\"#" + s.tokenEnd + "\"");
}
subjOut.print(">");
subjOut.print("");
subjOut.print("");
if(s.hasProperty("id")) {
String id = (String) s.getProperty("id");
midToLKid.put(id, "" + idCounter);
}
if(false) {
if(s.start < s.end) {
String t = text.text.substring(s.start, s.end);
subjOut.print(" ");
}
subjOut.print(" ");
}
subjOut.println();
}
}
}
subjOut.println("");
subjOut.println("");
// Section: attitudes; their ids are registered (untrimmed) as well.
for(AnnotationLayer l: text.layers) {
for(Span s: l) {
if(s.label.equals("GATE_attitude")) {
idCounter++;
subjOut.print(" = 0) {
subjOut.print(" start=\"#" + (s.tokenStart+1) + "\"");
subjOut.print(" end=\"#" + s.tokenEnd + "\"");
}
if(s.hasProperty("id")) {
String id = (String) s.getProperty("id");
midToLKid.put(id, "" + idCounter);
}
subjOut.print(">");
subjOut.print("");
subjOut.print("");
if(false) {
if(s.start < s.end) {
String t = text.text.substring(s.start, s.end);
subjOut.print(" ");
}
subjOut.print(" ");
}
subjOut.println();
}
}
}
subjOut.println("");
if(baseName.equals("xbank_wsj_0187")) {
// bug: the misspelled id "agreeincprinciple" is made an alias of "agreeinprinciple"
String id = midToLKid.get("agreeinprinciple");
midToLKid.put("agreeincprinciple",id);
}
subjOut.println("");
// Section: direct-subjective annotations.
for(AnnotationLayer l: text.layers) {
for(Span s: l) {
if(s.label.equals("GATE_direct-subjective")) {
idCounter++;
subjOut.print(" = 0) {
subjOut.print(" start=\"#" + (s.tokenStart+1) + "\"");
subjOut.print(" end=\"#" + s.tokenEnd + "\"");
}
subjOut.print(">");
subjOut.print("");
subjOut.print("");
if(s.start < s.end) {
String t = text.text.substring(s.start, s.end);
if(false)
subjOut.print(" ");
}
subjOut.println();
}
}
}
subjOut.println("");
subjOut.println("");
subjOut.flush();
if(debug)
System.exit(0);
}
/**
 * Prints an XML attribute whose value is a comma-separated list of id references.
 *
 * {@code s} is trimmed and split on commas (with optional surrounding
 * whitespace); every piece is looked up in {@code ids} and written as
 * {@code #<mapped id>}. Pieces without a mapping are reported on standard error
 * and left out. The output has the form {@code  att="#1,#2"}, with a leading
 * space; when no piece can be resolved the attribute value is empty.
 *
 * @param s        comma-separated list of MPQA ids; must not be null
 * @param att      attribute name
 * @param baseName document name (not used)
 * @param ids      mapping from MPQA id to output id
 * @param pw       destination; neither flushed nor closed here
 */
private static void printIdList(String s, String att, String baseName,
        HashMap<String, String> ids, PrintWriter pw) {
    // The map is typed so that the lookup yields a String; with a raw HashMap
    // the assignment from ids.get(...) does not compile.
    String[] mpqaIds = s.trim().split("\\s*,\\s*");
    pw.print(" " + att + "=\"");
    String separator = "";
    for (String mpqaId : mpqaIds) {
        String lid = ids.get(mpqaId);
        if (lid == null) {
            // Best effort by design: an unresolved reference is dropped, not fatal.
            System.err.println("*** Warning: unknown id |" + mpqaId + "|");
            continue;
        }
        pw.print(separator);
        pw.print("#" + lid);
        separator = ",";
    }
    pw.print("\"");
}
private static String escapeXML(String s) {
s = s.replaceAll("&", "&");
s = s.replaceAll("\"", """);
s = s.replaceAll("<", "<");
s = s.replaceAll(">", ">");
return s;
}
/**
 * Command-line entry point.
 *
 * Arguments:
 *   argv[0]  directory handed to MPQAReader.processDirectory
 *   argv[1]  output directory
 *   argv[2]  file with the parsed sentences read by CoNLL2008Format.readNextGraph;
 *            decompressed on the fly when the name ends in ".gz"
 *   argv[3]  file with one SST line per sentence; ".gz" is handled the same way
 *
 * Every document is converted with processText. Afterwards the two sentence
 * counters are printed and it is checked that no parsed sentence is left unread.
 * Failures during processing are reported with a stack trace. Both input files
 * are closed in all cases.
 */
public static void main(String[] argv) {
    if (argv.length < 4) {
        System.err.println("Usage: MPQAToLK <mpqa-dir> <output-dir> <srl-file[.gz]> <sst-file[.gz]>");
        System.exit(1);
    }
    String dir = argv[1];
    String srlFile = argv[2];
    String sstFile = argv[3];

    Scanner srlInput = null;
    BufferedReader sstInput = null;
    try {
        // Both inputs are read with the platform default charset.
        if (srlFile.endsWith(".gz")) {
            srlInput = new Scanner(new GZIPInputStream(new FileInputStream(srlFile)));
        } else {
            srlInput = new Scanner(new File(srlFile));
        }
        if (sstFile.endsWith(".gz")) {
            sstInput = new BufferedReader(
                new InputStreamReader(new GZIPInputStream(new FileInputStream(sstFile))));
        } else {
            sstInput = new BufferedReader(new FileReader(sstFile));
        }

        Iterator<AnnotatedText> docs = MPQAReader.processDirectory(argv[0]);
        while (docs.hasNext()) {
            processText(dir, docs.next(), srlInput, sstInput);
        }
        System.out.println("sen0 = " + sen0);
        System.out.println("sen1 = " + sen1);

        // A further graph means the parsed input and the documents are out of step.
        if (CoNLL2008Format.readNextGraph(srlInput) != null) {
            throw new RuntimeException("did not read all trees");
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (srlInput != null) {
            srlInput.close();
        }
        if (sstInput != null) {
            try {
                sstInput.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
}