package mpqareader;
import java.io.*;
import java.util.*;
import java.util.regex.*;
import se.lth.cs.nlp.nlputils.annotations.*;
import se.lth.cs.nlp.nlputils.core.Util;
import se.lth.cs.nlp.nlputils.core.Strings;
public class MPQAReader {
// Compile-time flag for tag filtering.
// NOTE(review): no active code path in this file depends on it — confirm whether it can be removed.
private static final boolean FILTER_TAGS = true;
// One annotation attribute of the form key="value" followed by optional whitespace.
// Group 1 is the key (lower-case letters and hyphens), group 2 the unquoted value.
static final Pattern PROP_PAT = Pattern.compile("([a-z\\-]+)=\"([^\"]*)\"\\s*");
// Shortest "<...>" run; '.' does not match line terminators, so a match never spans lines.
static final Pattern TAG_PAT = Pattern.compile("<.*?>");
// Tags plus e-mail/document header lines (From:, Sent:, Subject:, X-...: and so on).
// NOTE(review): the first alternative "()" is an empty group. It matches the empty string at
// every position and, being first, pre-empts all other alternatives, so as written this pattern
// only ever yields empty matches. It looks like text was lost from that group — confirm against
// the upstream source before relying on this pattern.
static final Pattern EXTENDED_TAG_PAT = Pattern.compile("()|(<.*?>)|\\.START|-----Original Message-----|((^|\\n)(From:|Sent:|To:|Date:|Subject:|DOCUMENT|TRANSLATION:|DATE:|Alias:|Marital status:|Message-ID:|Mime-Version:|Content-Type:|Content-Transfer-Encoding:|X-(.*?):).*)+");
static class MPQADirIterator implements Iterator {
// Part of the constructor's dirName argument that precedes "database.mpqa.2.0";
// prepended to entries of the explicit file list in next().
private String dirName;
// Entries of "<dirName>/docs" (directory-walk mode only).
private File[] subdirs1;
// Entries of the currently visited subdirs1[index1]; assigned in hasNext().
private File[] subdirs2;
// Cursor into subdirs1; -1 before the first hasNext() call.
private int index1;
// Cursor into subdirs2; -1 before the first hasNext() call.
private int index2;
// When true, next() runs removeTags() on the text before tokenizing.
private boolean removeTags;
// Iterator over an explicit collection of files; null in directory-walk mode.
private Iterator fileListIterator;
MPQADirIterator(String dirName, boolean removeTags, Collection files) {
this.dirName = dirName.substring(0, dirName.indexOf("database.mpqa.2.0"));
if(files == null) {
File dir = new File(dirName + File.separatorChar + "docs");
subdirs1 = dir.listFiles();
if(subdirs1 == null)
throw new IllegalArgumentException("Could not read directory");
index1 = -1;
index2 = -1;
} else
fileListIterator = files.iterator();
this.removeTags = removeTags;
}
/**
 * Creates a directory-walking iterator: no explicit file list and removeTags disabled.
 *
 * @param dirName path of the corpus directory, passed to the three-argument constructor
 */
MPQADirIterator(String dirName) {
this(dirName, false, null);
}
/* TODO state checks */
//private AnnotatedText nextValue;
/**
 * Reports whether another document is available.
 *
 * In file-list mode this delegates to the list iterator. In directory-walk mode it
 * also ADVANCES the (index1, index2) cursor to the document that next() will read,
 * so it must be called exactly once before each next().
 *
 * @return true if a document is available
 * @throws RuntimeException if a subdirectory cannot be listed or is empty
 */
public boolean hasNext() {
    if (fileListIterator != null) {
        return fileListIterator.hasNext();
    }

    // Still inside the current subdirectory: step to its next entry.
    boolean moreInCurrentDir = index2 != -1 && index2 != subdirs2.length - 1;
    if (moreInCurrentDir) {
        index2++;
        return true;
    }

    // Otherwise move on to the next subdirectory, if any.
    index1++;
    if (index1 >= subdirs1.length) {
        return false;
    }
    File subdir = subdirs1[index1];
    subdirs2 = subdir.listFiles();
    if (subdirs2 == null) {
        throw new RuntimeException("could not list directory: " + subdir);
    }
    if (subdirs2.length == 0) {
        throw new RuntimeException("directory is empty");
    }
    index2 = 0;
    return true;
}
// Scratch buffer used by next() when reading document files.
// NOTE(review): static and mutable, so it is shared by every iterator instance;
// concurrent next() calls would overwrite each other's data.
private static char[] buf = new char[10000];
/**
 * Loads the next document together with its manual annotations and metadata,
 * applies the hard-coded offset fixes for known corpus errors, tokenizes it and
 * validates the sentence layer.
 *
 * In directory-walk mode the document is the one the cursor was moved to by the
 * preceding hasNext() call.
 *
 * @return the annotated, tokenized text
 * @throws RuntimeException on I/O errors (wrapping the IOException), on malformed
 *                          annotation or metadata lines, and on inconsistent
 *                          sentence spans
 */
public AnnotatedText next() {
    try {
        File docFile = nextDocumentFile();
        String mpqaFileName = docFile.getPath().replaceFirst(".*database.mpqa.2.0/docs", "database.mpqa.2.0/docs");
        AnnotatedText text = new AnnotatedText();
        text.setProperty("mpqa_file", mpqaFileName);
        text.text = readDocumentText(docFile);

        String annDir = docFile.getPath().replaceFirst("docs", "man_anns");
        String metaPath = docFile.getPath().replaceFirst("docs", "meta_anns");
        File[] annFiles = new File(annDir).listFiles();
        if (annFiles == null)
            throw new RuntimeException("could not list directory: " + annDir);
        for (File annFile : annFiles)
            text.layers.add(readAnnotationLayer(annFile));
        File metaFile = new File(metaPath);
        if (metaFile.exists())
            readMetaAnnotations(metaFile, text);

        // Descending label order; the code below expects the sentence layer at index 0.
        Collections.sort(text.layers, new Comparator<AnnotationLayer>() {
            public int compare(AnnotationLayer o1, AnnotationLayer o2) {
                return -o1.label.compareTo(o2.label);
            }
        });
        applyKnownCorpusFixes(text, mpqaFileName);
        if (removeTags)
            removeTags(text);
        tokenize(text);
        Collections.sort(text.layers.get(0).spans, Span.ByLeftOrder.instance());
        checkSentenceSpans(text, mpqaFileName);
        return text;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

/** Picks the file to read, skipping the one document known to be unfixable. */
private File nextDocumentFile() {
    File docFile = (fileListIterator == null)
            ? subdirs2[index2]
            : new File(dirName + fileListIterator.next());
    if (docFile.getAbsolutePath().contains("database.mpqa.2.0/docs/20020203/20.46.36-9539")) {
        warn("*** Warning: skipped " + docFile);
        if (!hasNext())
            throw new RuntimeException("!!!");
        // In file-list mode subdirs2 is null, so the replacement must come from the list.
        docFile = (fileListIterator == null)
                ? subdirs2[index2]
                : new File(dirName + fileListIterator.next());
    }
    return docFile;
}

/** Reads the whole file into a string and closes the reader. */
private static String readDocumentText(File docFile) throws IOException {
    BufferedReader reader = new BufferedReader(new FileReader(docFile));
    try {
        StringBuilder sb = new StringBuilder();
        while (true) {
            int n = reader.read(buf);
            if (n == -1)
                break;
            sb.append(buf, 0, n);
        }
        return sb.toString();
    } finally {
        reader.close();
    }
}

/** Parses one tab-separated annotation file into a layer labelled with the file name. */
private static AnnotationLayer readAnnotationLayer(File annFile) throws IOException {
    AnnotationLayer layer = new AnnotationLayer();
    layer.label = annFile.getName();
    BufferedReader reader = new BufferedReader(new FileReader(annFile));
    try {
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
            line = line.trim();
            if (line.equals("") || line.startsWith("#"))
                continue;
            // Fields: id, "start,end", the literal "string", label, optional attributes.
            String[] ss = line.split("\\t");
            if (ss.length != 4 && ss.length != 5) {
                System.out.println("line2 = " + line);
                System.out.println(Arrays.asList(ss));
                throw new RuntimeException("ss.length != 5");
            }
            if (!ss[2].equals("string"))
                throw new RuntimeException("ss[2] != string");
            String[] ss2 = ss[1].split("\\,");
            if (ss2.length != 2)
                throw new RuntimeException("ss2.length != 2");
            int start = Integer.parseInt(ss2[0]);
            int end = Integer.parseInt(ss2[1]);
            Span span = new Span(start, end, ss[3]);
            span.id = ss[0];
            if (ss.length == 5) {
                String props = ss[4].trim();
                if (props.length() > 0) {
                    // The attributes must be a gap-free sequence of key="value" pairs.
                    Matcher m = PROP_PAT.matcher(props);
                    int ix = 0;
                    while (m.find()) {
                        span.setProperty(m.group(1), m.group(2));
                        if (m.start() != ix)
                            throw new RuntimeException("!!!");
                        ix = m.end();
                    }
                    if (ix != props.length())
                        throw new RuntimeException("!!!");
                }
            }
            layer.add(span);
        }
    } finally {
        reader.close();
    }
    return layer;
}

/** Copies document-level metadata into properties of {@code text}. */
@SuppressWarnings("unchecked")
private static void readMetaAnnotations(File metaFile, AnnotatedText text) throws IOException {
    BufferedReader reader = new BufferedReader(new FileReader(metaFile));
    try {
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
            if (line.startsWith("#"))
                continue;
            String[] ss = line.split("\\t");
            if (ss.length != 4 && ss.length != 5) {
                System.out.println("line2 = " + line);
                System.out.println(Arrays.asList(ss));
                throw new RuntimeException("ss.length != 5");
            }
            String v = ss.length == 4 ? "" : ss[4];
            // The key is the label with its first five characters dropped.
            String k = ss[3].substring(5);
            if (k.matches("region|subregion|country|topic")) {
                // These keys may repeat and are collected in a list.
                ArrayList<String> values = (ArrayList<String>) text.getProperty(k);
                if (values == null) {
                    values = new ArrayList<String>();
                    text.setProperty(k, values);
                }
                values.add(v);
            } else {
                String previous = (String) text.getProperty(k);
                if (previous != null)
                    throw new RuntimeException("property " + k + " already set");
                text.setProperty(k, v);
            }
        }
    } finally {
        reader.close();
    }
}

/** Applies the hard-coded offset corrections for specific corpus files. */
private static void applyKnownCorpusFixes(AnnotatedText text, String mpqaFileName) {
    if (mpqaFileName.equals("database.mpqa.2.0/docs/xbank/wsj_0583")) {
        for (AnnotationLayer l : text.layers)
            for (Span s : l)
                if (s.start >= 2268) {
                    s.start -= 3;
                    s.end -= 3;
                    warn("Fixing error in wsj_0583");
                }
    } else if (mpqaFileName.equals("database.mpqa.2.0/docs/20020203/20.46.36-9539")) {
        throw new RuntimeException("This text is buggy and cannot be fixed");
    } else if (mpqaFileName.equals("database.mpqa.2.0/docs/ula/Article247_66")) {
        for (Iterator<Span> it = text.layers.get(0).iterator(); it.hasNext(); ) {
            Span s = it.next();
            if (s.start == 880 && s.end == 894) {
                it.remove();
                break;
            }
        }
    } else if (mpqaFileName.equals("database.mpqa.2.0/docs/20020221/21.03.10-21966")) {
        for (AnnotationLayer l : text.layers)
            for (Span s : l)
                if (s.start == 641)
                    s.start++;
    }
}

/** Checks that layer 0 holds only sentences with ordered, in-range token spans. */
private static void checkSentenceSpans(AnnotatedText text, String mpqaFileName) {
    Span prevSen = null;
    for (Span sentence : (AnnotationLayer) text.layers.get(0)) {
        if (!sentence.label.equals("GATE_sentence"))
            throw new RuntimeException("not sentence");
        if (sentence.tokenStart == -1 && sentence.start >= 0)
            sentence.tokenStart = 0;
        if (sentence.tokenEnd == -1 && sentence.end >= 0)
            sentence.tokenEnd = sentence.tokenStart;
        if (sentence.tokenEnd < sentence.tokenStart)
            throw new RuntimeException("negative");
        if (sentence.tokenEnd > sentence.tokenStart
                && sentence.tokenEnd > text.tokens.length) {
            for (int i = 0; i < text.tokens.length; i++)
                System.out.println(i + "\t" + text.tokens[i]);
            System.out.println("sen = |" + text.text.substring(sentence.start, sentence.end) + "|");
            throw new RuntimeException("2. sentence = " + sentence);
        }
        if (prevSen != null && sentence.tokenEnd > sentence.tokenStart) {
            if (sentence.start < prevSen.end)
                throw new RuntimeException("sentences not ordered [char]");
            if (sentence.tokenStart < prevSen.tokenEnd) {
                System.err.println(mpqaFileName);
                String s1 = text.text.substring(sentence.start, sentence.end);
                String s2 = text.text.substring(prevSen.start, prevSen.end);
                System.err.println("|" + s1 + "|");
                System.err.println("|" + s2 + "|");
                for (int i = prevSen.tokenStart; i < prevSen.tokenEnd; i++)
                    System.err.println(text.tokens[i]);
                System.err.println();
                for (int i = sentence.tokenStart; i < sentence.tokenEnd; i++)
                    System.err.println(text.tokens[i]);
                try {
                    RawXMLAnnotation.instance().printLayers(System.err, text.text, Collections.singletonList(text.layers.get(0)));
                } catch (Exception e) {
                    e.printStackTrace();
                    System.exit(1);
                }
                throw new RuntimeException("sentences not ordered [token] " + prevSen + ", " + sentence);
            }
        }
        // Only non-empty sentences count as "previous" for the ordering checks.
        if (sentence.tokenEnd > sentence.tokenStart)
            prevSen = sentence;
    }
}
/**
 * Overwrites every match of EXTENDED_TAG_PAT in {@code text.text} with blanks:
 * each character matched by '.' inside the match becomes a space, while line
 * terminators are kept as they are.
 *
 * @param text the document whose text is rewritten in place
 */
private void removeTags(AnnotatedText text) {
    Matcher tagMatcher = EXTENDED_TAG_PAT.matcher(text.text);
    StringBuffer blanked = new StringBuffer();
    for (boolean found = tagMatcher.find(); found; found = tagMatcher.find()) {
        String padding = tagMatcher.group().replaceAll(".", " ");
        tagMatcher.appendReplacement(blanked, padding);
    }
    tagMatcher.appendTail(blanked);
    text.text = blanked.toString();
}
/**
 * Removal is not supported by this iterator.
 *
 * @throws UnsupportedOperationException always
 */
public void remove() {
throw new UnsupportedOperationException("unsupported");
}
}
/**
 * Diagnostic hook that is currently switched off: the message would go to
 * standard error, but the switch below is hard-wired to false.
 *
 * @param msg the message that would be printed
 */
private static void warn(Object msg) {
    final boolean enabled = false;
    if (!enabled) {
        return;
    }
    System.err.println(msg);
}
/**
 * Returns an iterator over all documents found under {@code dirName/docs}.
 * Uses the one-argument MPQADirIterator constructor, which disables removeTags.
 *
 * @param dirName corpus directory
 * @return an iterator whose next() yields AnnotatedText objects
 */
public static Iterator processDirectory(String dirName) {
return new MPQADirIterator(dirName);
}
/**
 * Returns an iterator over an explicit selection of documents.
 * Unlike the one-argument overload, this one enables removeTags.
 *
 * @param dirName       corpus directory
 * @param selectedFiles files to visit, passed to the MPQADirIterator constructor
 * @return an iterator whose next() yields AnnotatedText objects
 */
public static Iterator processDirectory(String dirName,
Collection selectedFiles) {
return new MPQADirIterator(dirName, true, selectedFiles);
}
/**
 * Debug driver: for every document under {@code argv[0]/docs} it loads the text,
 * the manual annotation layers and the metadata, tokenizes the text, splits the
 * layers and prints each layer to standard output.
 *
 * Any exception is printed with its stack trace and ends the run.
 *
 * @param argv argv[0] is the corpus directory
 */
public static void main0(String[] argv) {
    try {
        String dirName = argv[0];
        File dir = new File(dirName + File.separatorChar + "docs");
        for (File subdir : dir.listFiles()) {
            for (File subfile2 : subdir.listFiles()) {
                System.out.println(subfile2);
                final int BUF_SIZE = 10000;
                byte[] buf = new byte[BUF_SIZE];
                ByteArrayOutputStream bos = new ByteArrayOutputStream();
                InputStream bis = new BufferedInputStream(new FileInputStream(subfile2));
                try {
                    int n = bis.read(buf);
                    while (n != -1) {
                        // Write only the bytes actually read; writing the whole buffer
                        // would append stale data after a short read.
                        bos.write(buf, 0, n);
                        n = bis.read(buf);
                    }
                } finally {
                    bis.close();
                }
                AnnotatedText text = new AnnotatedText();
                text.text = new String(bos.toByteArray());
                String annDir = subfile2.getPath().replaceFirst("docs", "man_anns");
                String metaPath = subfile2.getPath().replaceFirst("docs", "meta_anns");
                for (File annFile : new File(annDir).listFiles()) {
                    System.out.println(annFile);
                    AnnotationLayer l = new AnnotationLayer();
                    l.label = annFile.getName();
                    BufferedReader br2 = new BufferedReader(new FileReader(annFile));
                    try {
                        String line2 = br2.readLine();
                        while (line2 != null) {
                            line2 = line2.trim();
                            if (!line2.equals("") && !line2.startsWith("#")) {
                                // Fields: id, "start,end", the literal "string", label, optional attributes.
                                String[] ss = line2.split("\\t");
                                if (ss.length != 4 && ss.length != 5) {
                                    System.out.println("line2 = " + line2);
                                    System.out.println(Arrays.asList(ss));
                                    throw new RuntimeException("ss.length != 5");
                                }
                                if (!ss[2].equals("string"))
                                    throw new RuntimeException("ss[2] != string");
                                String[] ss2 = ss[1].split("\\,");
                                if (ss2.length != 2)
                                    throw new RuntimeException("ss2.length != 2");
                                int start = Integer.parseInt(ss2[0]);
                                int end = Integer.parseInt(ss2[1]);
                                Span span = new Span(start, end, ss[3]);
                                span.id = ss[0];
                                if (ss.length == 5) {
                                    ss[4] = ss[4].trim();
                                    if (ss[4].length() > 0) {
                                        // The attributes must be a gap-free sequence of key="value" pairs.
                                        Matcher m = PROP_PAT.matcher(ss[4]);
                                        int ix = 0;
                                        while (m.find()) {
                                            String k = m.group(1);
                                            String v = m.group(2);
                                            span.setProperty(k, v);
                                            if (m.start() != ix)
                                                throw new RuntimeException("!!!");
                                            ix = m.end();
                                        }
                                        if (ix != ss[4].length())
                                            throw new RuntimeException("!!!");
                                    }
                                }
                                l.add(span);
                            }
                            line2 = br2.readLine();
                        }
                    } finally {
                        br2.close();
                    }
                    text.layers.add(l);
                }
                File metaFile = new File(metaPath);
                if (metaFile.exists()) {
                    BufferedReader br2 = new BufferedReader(new FileReader(metaFile));
                    try {
                        String line2 = br2.readLine();
                        while (line2 != null) {
                            if (!line2.startsWith("#")) {
                                String[] ss = line2.split("\\t");
                                if (ss.length != 4 && ss.length != 5) {
                                    System.out.println("line2 = " + line2);
                                    System.out.println(Arrays.asList(ss));
                                    throw new RuntimeException("ss.length != 5");
                                }
                                String v = ss.length == 4 ? "" : ss[4];
                                // The key is the label with its first five characters dropped.
                                String k = ss[3].substring(5);
                                if (k.matches("region|subregion|country|topic")) {
                                    // These keys may repeat and are collected in a list.
                                    @SuppressWarnings("unchecked")
                                    ArrayList<String> values = (ArrayList<String>) text.getProperty(k);
                                    if (values == null) {
                                        values = new ArrayList<String>();
                                        text.setProperty(k, values);
                                    }
                                    values.add(v);
                                } else {
                                    String v2 = (String) text.getProperty(k);
                                    if (v2 != null)
                                        throw new RuntimeException("property " + k + " already set");
                                    text.setProperty(k, v);
                                }
                            }
                            line2 = br2.readLine();
                        }
                    } finally {
                        br2.close();
                    }
                }
                tokenize(text);
                text.splitLayers();
                for (AnnotationLayer l : text.layers) {
                    System.out.println(l.label);
                    List ls = Collections.singletonList(l);
                    try {
                        RawXMLAnnotation.instance().printLayers(text.text, ls);
                    } catch (Exception e) {
                        System.out.println("*** Exception: " + e);
                        System.out.println(l);
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Masks every match of TAG_PAT with tildes: each character matched by '.'
 * inside the tag becomes '~'. Text outside tags is copied unchanged.
 *
 * @param text the raw document text
 * @return the text with tags masked
 */
private static String escapeTags(String text) {
    Matcher tagMatcher = TAG_PAT.matcher(text);
    StringBuffer masked = new StringBuffer();
    for (boolean found = tagMatcher.find(); found; found = tagMatcher.find()) {
        String tildes = tagMatcher.group().replaceAll(".", "~");
        tagMatcher.appendReplacement(masked, tildes);
    }
    tagMatcher.appendTail(masked);
    return masked.toString();
}
/**
 * Tokenizes the document sentence by sentence and maps every span in every layer
 * onto token indices.
 *
 * Steps:
 *   1. Temporarily masks tags with '~' (escapeTags).
 *   2. Stops the program if a sentence starts in the middle of a word.
 *   3. Sorts layer 0 (expected to hold GATE_sentence spans), tokenizes each sentence
 *      and records each token's character start and end.
 *   4. Stores the offsets as the "token-starts" and "token-ends" properties and sets
 *      tokenStart/tokenEnd on every span.
 *   5. Restores the unmasked text and replaces tokens starting with "~~" by the
 *      original characters at their offsets.
 *
 * @param text the document; text, tokens, properties and span token indices are updated in place
 * @throws RuntimeException if layer 0 contains a non-sentence span, if sentences
 *                          overlap, or if a token cannot be aligned with the text
 */
private static void tokenize(AnnotatedText text) {
    String oldText = text.text;
    text.text = escapeTags(text.text);

    for (Span sen : (AnnotationLayer) text.layers.get(0)) {
        if (!sen.label.equals("GATE_sentence"))
            throw new RuntimeException("not sentence: " + sen);
        int start = sen.start;
        int end = sen.end;
        if (start > 0
                && Character.isLetter(text.text.charAt(start - 1))
                && Character.isLetter(text.text.charAt(start))) {
            // A sentence boundary inside a word means the offsets are wrong.
            // NOTE(review): this terminates the JVM with exit code 0 after printing diagnostics.
            System.err.println("*** WARNING: " + text.getProperty("mpqa_file") + " problematic");
            String snippet = text.text.substring(start, end);
            System.out.println("This sentence: |" + snippet + "|");
            try {
                RawXMLAnnotation.instance().printLayers(System.out, text.text, Collections.singletonList(text.layers.get(0)));
            } catch (Exception e) {
                e.printStackTrace();
                System.exit(1);
            }
            System.exit(0);
            break;
        }
    }

    AnnotationLayer senLayer = text.layers.get(0);
    Collections.sort(senLayer.spans, Span.NestingOrder.instance());
    // Typed lists: with raw ArrayLists the toArray assignment and the int[] indexing below do not compile.
    ArrayList<String> out = new ArrayList<String>();
    ArrayList<int[]> offsets = new ArrayList<int[]>();
    Span prevSen = null;
    for (Span sen : (AnnotationLayer) senLayer) {
        if (prevSen != null) {
            if (sen.start < prevSen.end) {
                System.out.println(senLayer);
                System.out.println(sen.start + ", " + prevSen.end);
                for (Span sen2 : (AnnotationLayer) senLayer) {
                    System.out.println(sen2.start + ", " + sen2.end + " |" + text.text.substring(sen2.start, sen2.end) + "|");
                }
                throw new RuntimeException("Sentences not ordered...");
            }
        }
        String senStr = text.text.substring(sen.start, sen.end);
        String[] ts = tokenizeSentence(senStr);
        for (String t : ts)
            out.add(t);
        // Walk the text in parallel with the tokens to recover character offsets.
        int position = getNext(text.text, sen.start);
        for (int i = 0; i < ts.length; i++) {
            int[] p = new int[2];
            p[0] = position;
            p[1] = getEnd(text.text, ts[i], position);
            position = getNext(text.text, p[1]);
            offsets.add(p);
        }
        prevSen = sen;
    }
    text.tokens = out.toArray(new String[0]);
    int[] starts = new int[text.tokens.length];
    int[] ends = new int[text.tokens.length];
    for (int i = 0; i < text.tokens.length; i++) {
        starts[i] = offsets.get(i)[0];
        ends[i] = offsets.get(i)[1];
    }
    text.setProperty("token-starts", starts);
    text.setProperty("token-ends", ends);

    if (text.getProperty("mpqa_file").equals("database.mpqa.2.0/docs/ula/AFGP-2002-600045-Trans")) {
        for (int i = 0; i < text.tokens.length; i++) {
            String t = text.text.substring(starts[i], ends[i]);
            warn(i + "\t|" + text.tokens[i] + "|\t|" + t + "|" + "\t" + starts[i] + "\t" + ends[i]);
        }
    }

    // The original passed a local "debug" flag (always false) as findIndexBSearch's isStart
    // argument and marked it "TODO probably a bug". The value is kept so results do not change.
    final boolean isStartArg = false;
    for (AnnotationLayer l : text.layers)
        for (Span s : l) {
            // Trim leading and trailing whitespace from the span before looking up tokens.
            int st = s.start;
            while (st < s.end && st < text.text.length() && Character.isWhitespace(text.text.charAt(st)))
                st++;
            int en = s.end;
            while (en > st && en < text.text.length() && Character.isWhitespace(text.text.charAt(en - 1)))
                en--;
            s.tokenStart = findIndexBSearch(starts, st, isStartArg);
            if (st < en && s.tokenStart < starts.length) {
                s.tokenEnd = findIndexBSearch(ends, en, isStartArg) + 1;
                if (s.tokenEnd > text.tokens.length)
                    s.tokenEnd = text.tokens.length;
                if (s.tokenEnd < s.tokenStart)
                    s.tokenEnd = text.tokens.length;
            } else {
                s.tokenEnd = s.tokenStart;
            }
        }

    // Restore the unmasked text and un-mask tokens that consisted of masked tag characters.
    text.text = oldText;
    for (int i = 0; i < text.tokens.length; i++) {
        if (text.tokens[i].startsWith("~~")) {
            text.tokens[i] = text.text.substring(starts[i], ends[i]);
            warn("Replaced: |" + text.tokens[i] + "|");
        }
    }
}
/**
 * Consistency check: the characters covered by a span (whitespace removed) must
 * equal the concatenation of its tokens, with `` and '' both read as a double
 * quote. Spans labelled "GATE_split" are not checked.
 *
 * @param s    the span to verify
 * @param text the document providing the text and tokens
 * @throws RuntimeException if the two renderings differ (both are printed to stderr first)
 */
private static void checkSpan(Span s, AnnotatedText text) {
    if (s.label.equals("GATE_split")) {
        return;
    }
    String fromChars = text.text.substring(s.start, s.end).replaceAll("\\s+", "");

    StringBuilder joined = new StringBuilder();
    int stop = Math.min(s.tokenEnd, text.tokens.length);
    for (int i = s.tokenStart; i < stop; i++) {
        joined.append(text.tokens[i]);
    }
    String fromTokens = joined.toString()
            .replaceAll("``", "\"")
            .replaceAll("''", "\"");

    if (fromChars.equals(fromTokens)) {
        return;
    }
    System.err.println("|" + fromChars + "|");
    System.err.println("|" + fromTokens + "|");
    throw new RuntimeException("!!");
}
/**
 * Binary search for {@code c} in an ascending array.
 *
 * If {@code c} is present its index is returned. Otherwise the result depends on
 * {@code isStart}:
 *   isStart == true  : the neighbour below c (-1 if c is below the first element,
 *                      the last index if c is above the last element)
 *   isStart == false : the neighbour above c (0 if c is below the first element,
 *                      -1 if c is above the last element)
 *
 * @param arr     ascending values
 * @param c       value to locate
 * @param isStart selects which neighbour to return when c is absent
 * @return an index into {@code arr}, or -1 (always -1 for an empty array)
 */
private static int findIndexBSearch(int[] arr, int c, boolean isStart) {
    final int last = arr.length - 1;
    if (last < 0) {
        return -1;
    }
    if (c < arr[0]) {
        return isStart ? -1 : 0;
    }
    if (c > arr[last]) {
        return isStart ? last : -1;
    }

    int lo = 0;
    int hi = last;
    int probe = (lo + hi) / 2;
    // Narrow [lo, hi] until the two bounds are adjacent (probe collapses onto lo).
    while (lo < probe && probe < hi) {
        if (arr[probe] == c) {
            return probe;
        }
        if (arr[lo] == c) {
            return lo;
        }
        if (arr[hi] == c) {
            return hi;
        }
        if (c < arr[probe]) {
            hi = probe;
        } else {
            lo = probe;
        }
        probe = (lo + hi) / 2;
    }

    if (lo >= hi) {
        return probe;
    }
    if (arr[lo] == c) {
        return lo;
    }
    if (arr[hi] == c) {
        return hi;
    }
    // c lies strictly between two adjacent entries.
    return isStart ? lo : hi;
}
/**
 * Self-test for findIndexBSearch using two small token tables (three tokens, then
 * two tokens). Each case row is {character index, expected token index, isStart as 1/0};
 * isStart also selects whether the starts or the ends table is searched.
 *
 * @throws RuntimeException on the first mismatch, naming the case index within its table
 */
private static void testBSearch() {
    final int[][] startTables = { {0, 4, 8}, {0, 4} };
    final int[][] endTables = { {3, 7, 11}, {3, 7} };
    final int[][][] suites = {
        {
            { -1, -1, 1 },
            { 0, 0, 1 },
            { 4, 1, 1 },
            { 8, 2, 1 },
            { 1, 0, 1 },
            { 2, 0, 1 },
            { 3, 0, 1 },   // !! boundary case: which result is actually wanted?
            { 5, 1, 1 },
            { 6, 1, 1 },
            { 7, 1, 1 },   // !!
            { 9, 2, 1 },
            { 10, 2, 1 },
            { 11, 2, 1 },  // !!
            { 12, 2, 1 },  // !??
            { 3, 0, 0 },
            { 7, 1, 0 },
            { 11, 2, 0 },
            { 0, 0, 0 },   // !??
            { 1, 0, 0 },
            { 2, 0, 0 },
            { 4, 1, 0 },   // !!
            { 5, 1, 0 },
            { 6, 1, 0 },
            { 8, 2, 0 },   // !!
            { 9, 2, 0 },
            { 10, 2, 0 },
            { 12, -1, 0 }, // !??
        },
        {
            { -1, -1, 1 },
            { 0, 0, 1 },
            { 4, 1, 1 },
            { 1, 0, 1 },
            { 2, 0, 1 },
            { 3, 0, 1 },   // !! boundary case: which result is actually wanted?
            { 5, 1, 1 },
            { 6, 1, 1 },
            { 7, 1, 1 },   // !!
            { 3, 0, 0 },
            { 7, 1, 0 },
            { 0, 0, 0 },   // !??
            { 1, 0, 0 },
            { 2, 0, 0 },
            { 4, 1, 0 },   // !!
            { 5, 1, 0 },
            { 6, 1, 0 },
            { 8, -1, 0 },  // !??
        },
    };
    for (int suite = 0; suite < suites.length; suite++) {
        int[][] cases = suites[suite];
        for (int i = 0; i < cases.length; i++) {
            int charIndex = cases[i][0];
            int tokenIndex = cases[i][1];
            boolean isStart = cases[i][2] != 0;
            int[] table = isStart ? startTables[suite] : endTables[suite];
            int ix = findIndexBSearch(table, charIndex, isStart);
            if (ix != tokenIndex) {
                throw new RuntimeException("error " + i + ": ix = " + ix);
            }
        }
    }
}
/**
 * Splits one sentence into tokens with a chain of regular-expression rewrites
 * ported from a Penn-Treebank-style sed tokenizer (lines marked RJ are local changes).
 *
 * Punctuation is separated from words, an opening double quote becomes ``, other
 * double quotes become '', English clitics ('s, n't, 'll, ...) are split off, and a
 * few fused words (cannot, gonna, ...) are split in two. Only a sentence-final
 * period is split off, so the input is assumed to be a single sentence.
 *
 * @param s the sentence text; tabs, CR and LF are treated as spaces
 * @return the tokens in order; an empty array if {@code s} is blank
 */
private static String[] tokenizeSentence(String s) {
    s = s.replaceAll("[\n\t\r]", " ");
    if (s.trim().equals(""))
        return new String[0];
    // RJ: keep runs of '~' (masked tags) together as separate tokens
    s = s.replaceAll("(~(~+))", " $1 ");
    // attempt to get correct directional quotes; close quotes are handled further down
    s = s.replaceAll("^\"", "`` ");
    s = s.replaceAll("([ \\(\\[{<])\"", "$1 `` ");
    s = s.replaceAll("\\.\\.\\.", " ... ");
    // RJ: comma removed from this class and handled by the next two rules
    s = s.replaceAll("[;:@#$%&]", " $0 ");
    // RJ: split a comma followed by a space, or sitting between two non-digits
    s = s.replaceAll("([^ ]), ", "$1 , ");
    s = s.replaceAll("([^0-9 ]),([^0-9 ])", "$1 , $2");
    // sentence splitting is assumed done already, so split FINAL periods only
    s = s.replaceAll("([^.])([.])([\\])}>\"']*)[ \\t]*$", "$1 $2$3");
    // question and exclamation marks have no abbreviation ambiguity: split them all
    s = s.replaceAll("[?!]", " $0 ");
    // parentheses, brackets, etc.
    s = s.replaceAll("[\\]\\[\\(\\){}\\<\\>]", " $0 ");
    s = s.replaceAll("--", " -- ");
    // pad with spaces so the rules below need no start/end-of-line variants
    s = " " + s + " ";
    s = s.replaceAll("\"", " '' ");
    // RJ (from MPQA): normalise stray backticks and quote pairs
    s = s.replaceAll("([^ `])`([^ `])", "$1'$2");
    s = s.replaceAll("`([^ `])", "` $1");
    s = s.replaceAll("([^ ])''", "$1 ''");
    // RJ: an apostrophe at the start of a word is an opening single quote ...
    s = s.replaceAll(" '([^ '])", " ` $1");
    // ... unless it was a detached possessive 's
    s = s.replaceAll(" ` s ", " 's ");
    // possessive or close-single-quote
    s = s.replaceAll("([^'])' ", "$1 ' ");
    // clitics as in it's, I'm, we'd
    s = s.replaceAll("'([sSmMdD]) ", " '$1 ");
    s = s.replaceAll("'ll ", " 'll ");
    s = s.replaceAll("'re ", " 're ");
    s = s.replaceAll("'ve ", " 've ");
    s = s.replaceAll("n't ", " n't ");
    s = s.replaceAll("'LL ", " 'LL ");
    s = s.replaceAll("'RE ", " 'RE ");
    s = s.replaceAll("'VE ", " 'VE ");
    s = s.replaceAll("N'T ", " N'T ");
    // fused words
    s = s.replaceAll(" ([Cc])annot ", " $1an not ");
    s = s.replaceAll(" ([Dd])'ye ", " $1' ye ");
    s = s.replaceAll(" ([Gg])imme ", " $1im me ");
    s = s.replaceAll(" ([Gg])onna ", " $1on na ");
    s = s.replaceAll(" ([Gg])otta ", " $1ot ta ");
    s = s.replaceAll(" ([Ll])emme ", " $1em me ");
    s = s.replaceAll(" ([Mm])ore'n ", " $1ore 'n ");
    s = s.replaceAll(" '([Tt])is ", " '$1 is ");
    s = s.replaceAll(" '([Tt])was ", " '$1 was ");
    s = s.replaceAll(" ([Ww])anna ", " $1an na ");
    // Collapse runs of spaces. The pattern must require at least one space: " *" also
    // matches the empty string and would insert a space between every pair of characters.
    s = s.replaceAll(" +", " ");
    s = s.trim();
    return s.split(" ");
}
/**
 * Skips whitespace forward from {@code position}.
 *
 * @param lkText   the text to scan
 * @param position index at which to start
 * @return the index of the first non-whitespace character at or after
 *         {@code position}, or the unchanged position if it is already at or
 *         past the end; the text length if only whitespace remains
 */
private static int getNext(String lkText, int position) {
    final int limit = lkText.length();
    int cursor = position;
    for (; cursor < limit; cursor++) {
        if (!Character.isWhitespace(lkText.charAt(cursor))) {
            break;
        }
    }
    return cursor;
}
/**
 * Returns the index just past {@code token} when it is aligned with the text at
 * {@code position}.
 *
 * The tokenizer rewrites quotes, so these mismatches are accepted:
 *   - token `` or '' against a double quote in the text (consumes one character)
 *   - token ` against ' in the text (consumes one character)
 *   - token and text equal once backticks are read as apostrophes
 *
 * @param lkText   the text the token came from
 * @param token    the token to align
 * @param position index in {@code lkText} where the token is expected to start
 * @return the index just past the token in {@code lkText}
 * @throws RuntimeException if the token cannot be matched at {@code position}
 *                          (context is printed to stdout first)
 */
private static int getEnd(String lkText, String token, int position) {
    int len = Math.min(lkText.length() - position, token.length());
    String actual = lkText.substring(position, position + len);

    if (token.equals(actual)) {
        return position + len;
    }
    if (token.matches("``|''") && actual.startsWith("\"")) {
        return position + 1;
    }
    if (token.equals("`") && actual.startsWith("'")) {
        return position + 1;
    }
    if (token.replaceAll("`", "'").equals(actual.replaceAll("`", "'"))) {
        return position + token.length();
    }

    System.out.println(getContext(lkText, position));
    System.out.println("position = " + position);
    System.out.println("ss = " + lkText.substring(position));
    throw new RuntimeException(token + " != " + actual);
}
/**
 * Returns up to 100 characters on either side of {@code pos}, clipped to the
 * bounds of the text.
 *
 * @param text the full text
 * @param pos  centre position of the excerpt
 * @return the excerpt around {@code pos}
 */
private static String getContext(String text, int pos) {
    int from = pos - 100;
    if (from < 0) {
        from = 0;
    }
    int to = pos + 100;
    if (to > text.length()) {
        to = text.length();
    }
    return text.substring(from, to);
}
/**
 * Debug driver: prints each document's "mpqa_file" property followed by its
 * tokens, one per line, prefixed by the token index.
 *
 * @param argv argv[0] is the corpus directory
 */
public static void main1(String[] argv) {
    // Typed iterator: with a raw Iterator, next() returns Object and the assignment below does not compile.
    @SuppressWarnings("unchecked")
    Iterator<AnnotatedText> docs = processDirectory(argv[0]);
    while (docs.hasNext()) {
        AnnotatedText t = docs.next();
        System.out.println(t.getProperty("mpqa_file"));
        for (int ii = 0; ii < t.tokens.length; ii++) {
            System.out.printf("%d\t%s\n", ii, t.tokens[ii]);
        }
    }
}
/**
 * Debug driver: joins the command-line arguments with single spaces, echoes the
 * result between bars, then prints its tokens one per line.
 *
 * @param argv words of the sentence to tokenize
 */
public static void main3(String[] argv) {
    String sentence = Strings.join(argv, " ");
    System.out.println("|" + sentence + "|");
    String[] tokens = tokenizeSentence(sentence);
    for (int k = 0; k < tokens.length; k++) {
        System.out.println(tokens[k]);
    }
}
/**
 * Debug driver: prints the number of documents, then the number of spans labelled
 * "GATE_sentence" across all layers of all documents.
 *
 * @param argv argv[0] is the corpus directory
 */
public static void main4(String[] argv) {
    // Typed iterator: with a raw Iterator, next() returns Object and the assignment below does not compile.
    @SuppressWarnings("unchecked")
    Iterator<AnnotatedText> docs = processDirectory(argv[0]);
    int count = 0;
    int countSentences = 0;
    while (docs.hasNext()) {
        count++;
        AnnotatedText text = docs.next();
        for (AnnotationLayer l : text.layers)
            for (Span s : l)
                if (s.label.equals("GATE_sentence"))
                    countSentences++;
    }
    System.out.println(count);
    System.out.println(countSentences);
}
/**
 * Splits the corpus into five cross-validation folds.
 *
 * Document names (with the "database.mpqa.2.0/docs/" prefix removed) are shuffled
 * with a fixed seed, dealt round-robin into five lists, and each list is written
 * sorted to {@code folds/fold0} .. {@code folds/fold4}, one name per line.
 * The process exits with status 1 if a fold file cannot be written.
 *
 * @param argv argv[0] is the corpus directory
 */
public static void main5(String[] argv) {
    // Typed iterator and lists: the raw versions make next() and remove() return Object, which does not compile here.
    @SuppressWarnings("unchecked")
    Iterator<AnnotatedText> docs = processDirectory(argv[0]);
    List<String> names = new ArrayList<String>();
    while (docs.hasNext()) {
        AnnotatedText text = docs.next();
        String f = (String) text.getProperty("mpqa_file");
        f = f.replaceAll("database.mpqa.2.0/docs/", "");
        names.add(f);
    }
    // Fixed seed so the fold assignment is reproducible.
    Random rand = new Random(0);
    Collections.shuffle(names, rand);
    final int N = 5;
    List<List<String>> folds = new ArrayList<List<String>>();
    for (int ii = 0; ii < N; ii++)
        folds.add(new ArrayList<String>());
    int ix = 0;
    while (!names.isEmpty()) {
        String f = names.remove(names.size() - 1);
        folds.get(ix).add(f);
        ix = (ix + 1) % N;
    }
    for (int ii = 0; ii < N; ii++) {
        String outfile = "folds/fold" + ii;
        try {
            PrintWriter pw = new PrintWriter(new FileWriter(outfile));
            try {
                Collections.sort(folds.get(ii));
                for (String f : folds.get(ii))
                    pw.println(f);
            } finally {
                // Close even if writing fails so the file handle is not leaked.
                pw.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}
/**
 * Debug driver for one document: finds the first document whose "mpqa_file"
 * contains "12.55.04-23296", dumps the file's bytes, prints every character of the
 * text with its index (characters below space are shown as ">"), then prints every
 * span of every layer, and stops.
 *
 * @param argv argv[0] is the corpus directory
 */
public static void main6(String[] argv) {
    // Typed iterator: with a raw Iterator, next() returns Object and the assignment below does not compile.
    @SuppressWarnings("unchecked")
    Iterator<AnnotatedText> docs = processDirectory(argv[0]);
    while (docs.hasNext()) {
        AnnotatedText t = docs.next();
        if (t.getProperty("mpqa_file").toString().contains("12.55.04-23296")) {
            Util.printFileBytes((String) t.getProperty("mpqa_file"));
            System.out.println("---");
            for (int ii = 0; ii < t.text.length(); ii++) {
                char c = t.text.charAt(ii);
                String cs = c < ' ' ? ">" : ("" + c);
                System.out.format("%d\t%s\n", ii, cs);
            }
            System.out.println("---");
            for (AnnotationLayer l : t.layers) {
                System.out.println("Layer: " + l.label);
                for (Span s : l) {
                    String ps = s.properties == null ? "(null)" : s.properties.toString();
                    // Guard against inverted spans, which would make substring throw.
                    String ss = s.start <= s.end ? t.text.substring(s.start, s.end) : "";
                    System.out.format("%d %d %d %d %s %s %s\n", s.start, s.end,
                            s.tokenStart, s.tokenEnd, "|" + ss + "|", s.label, ps);
                }
            }
            break;
        }
    }
}
/**
 * For every document whose file name contains "wsj", prints the non-empty
 * objective-speech-event, direct-subjective and expressive-subjectivity spans
 * taken from layer 1 as a single layer on standard output.
 *
 * Any exception is printed and the process exits with status 1.
 *
 * @param argv argv[0] is the corpus directory
 */
public static void main7(String[] argv) {
    try {
        // Typed iterator: with a raw Iterator, next() returns Object and the assignment below does not compile.
        @SuppressWarnings("unchecked")
        Iterator<AnnotatedText> docs = processDirectory(argv[0]);
        while (docs.hasNext()) {
            AnnotatedText t = docs.next();
            String filename = (String) t.getProperty("mpqa_file");
            // Keep only the last path component.
            filename = filename.replaceAll(".*/", "");
            if (!filename.contains("wsj"))
                continue;
            AnnotationLayer l = new AnnotationLayer();
            AnnotationLayer l0 = t.layers.get(1);
            for (Span s : l0.spans) {
                if (s.label.matches("GATE_(objective-speech-event|direct-subjective|expressive-subjectivity)") && s.start < s.end)
                    l.spans.add(s);
            }
            List<AnnotationLayer> ls = new ArrayList<AnnotationLayer>();
            ls.add(l);
            System.out.println("");
            RawXMLAnnotation.instance().printLayers(System.out, t.text,
                    ls);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(1);
    }
}
/**
 * Program entry point; currently runs main7. The other drivers are kept as
 * commented-out alternatives.
 *
 * @param argv passed through to main7
 */
public static void main(String[] argv) {
//main6(argv);
main7(argv);
//testBSearch();
}
}