package lkformat2;
import java.io.*;
import java.util.*;
/**
 * Flattens sentence/token annotation files into a plain-text stream: one token per line, with an
 * empty line after each sentence, bracketed per input by {@code ___BEGIN___|file} and
 * {@code ___END___|file} marker lines.
 *
 * <p>An input file is only processed if it contains a line with {@code provides="SENTENCES"}.
 * The lines that follow give each sentence as a {@code start="#id"} / {@code end="#id"} pair.
 * Tokens are read from the file named by that line's {@code scope} attribute (or from the same
 * file when the attribute is absent), after a line containing {@code provides="TOKENS"}; each
 * token line carries an integer {@code id} attribute and the token text as element content.
 *
 * <p>Any failure prints a stack trace and terminates the JVM with exit status 1.
 */
public class PreprocessSSTLight {

    // NOTE(review): the literal that marks the end of the SENTENCES and TOKENS sections was
    // empty in the checked-in source (so the section loops could never run). Treating any line
    // that starts with a closing tag as the end of the section is an assumption about the file
    // format -- confirm against real input before relying on it.
    private static final String SECTION_END_PREFIX = "</";

    private static final String BEGIN_MARKER = "___BEGIN___|";
    private static final String END_MARKER = "___END___|";

    /**
     * Returns the text between the quotes of the first {@code attr="..."} occurrence in
     * {@code line}, or {@code null} if the attribute is absent or its value is not terminated by
     * a closing quote.
     *
     * <p>Matching is a plain substring search for {@code attr="}, so an attribute whose name
     * merely ends with {@code attr} matches as well.
     */
    private static String extractAttribute(String line, String attr) {
        String prefix = attr + "=\"";
        int valueStart = line.indexOf(prefix);
        if (valueStart == -1) {
            return null;
        }
        valueStart += prefix.length();
        int valueEnd = line.indexOf('"', valueStart);
        if (valueEnd == -1) {
            // Unterminated value: report "no attribute" rather than failing in substring().
            return null;
        }
        return line.substring(valueStart, valueEnd);
    }

    /**
     * Returns the text between the end of the first tag on {@code line} and the last closing tag
     * on it, or {@code null} if the line does not have that shape.
     */
    private static String extractEntityData(String line) {
        int tagOpen = line.indexOf('<');
        if (tagOpen == -1) {
            return null;
        }
        int tagEnd = line.indexOf('>', tagOpen);
        if (tagEnd == -1) {
            return null;
        }
        int closingTag = line.lastIndexOf("</");
        if (closingTag < tagEnd) {
            // Covers both "no closing tag" (-1) and a closing tag that precedes the opening one.
            return null;
        }
        return line.substring(tagEnd + 1, closingTag);
    }

    /** Opens {@code fileName} for line-wise reading, decoding it as UTF-8. */
    private static BufferedReader openUtf8(String fileName) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "UTF-8"));
    }

    /**
     * Consumes lines from {@code br} up to and including the first one that contains
     * {@code marker}; returns that line, or {@code null} if the end of the file is reached first.
     */
    private static String skipToLineContaining(BufferedReader br, String marker) throws IOException {
        String line = br.readLine();
        while (line != null && !line.contains(marker)) {
            line = br.readLine();
        }
        return line;
    }

    /**
     * Reads sentence spans from {@code br}, which must be positioned just after the SENTENCES
     * header line, until the section-end line.
     *
     * @return one {@code {startId, endId}} pair per sentence, with the leading {@code #} removed
     * @throws RuntimeException if a line lacks {@code start}/{@code end}, if either value is not
     *     a {@code #}-prefixed reference, or if the file ends before the section does
     */
    private static List<String[]> readSentenceSpans(BufferedReader br, String fileName)
            throws IOException {
        List<String[]> spans = new ArrayList<String[]>();
        while (true) {
            String line = br.readLine();
            if (line == null) {
                throw new RuntimeException(
                        "Unexpected end of file in SENTENCES section of " + fileName);
            }
            if (line.trim().startsWith(SECTION_END_PREFIX)) {
                return spans;
            }
            String start = extractAttribute(line, "start");
            String end = extractAttribute(line, "end");
            if (start == null || end == null) {
                throw new RuntimeException("Only start-end annotation supported for sentences");
            }
            // startsWith also rejects an empty value, which charAt(0) would crash on.
            if (!start.startsWith("#") || !end.startsWith("#")) {
                throw new RuntimeException("Only relative URIs supported for sentences: line = "
                        + line + " start = " + start + " end = " + end);
            }
            spans.add(new String[] { start.substring(1), end.substring(1) });
        }
    }

    /**
     * Reads the TOKENS section from {@code br} and prints every token that falls inside one of
     * {@code spans}, one per line, followed by an empty line at the end of each sentence.
     * Reading stops after the last sentence or at the section-end line, whichever comes first.
     *
     * @param spans non-empty list of {@code {startId, endId}} pairs, in document order
     * @throws RuntimeException if the TOKENS header is missing, a token line cannot be parsed,
     *     token ids are not the contiguous sequence 1, 2, 3, ..., or the file ends early
     */
    private static void writeSentences(BufferedReader br, String tokenFile, List<String[]> spans,
            PrintWriter out) throws IOException {
        if (skipToLineContaining(br, "provides=\"TOKENS\"") == null) {
            throw new RuntimeException("No TOKENS annotation found in " + tokenFile);
        }
        int sentenceIndex = 0;
        String[] span = spans.get(sentenceIndex);
        boolean inside = false;
        int previousId = 0;
        for (String line = br.readLine(); ; line = br.readLine()) {
            if (line == null) {
                throw new RuntimeException(
                        "Unexpected end of file in TOKENS section of " + tokenFile);
            }
            line = line.trim();
            if (line.startsWith(SECTION_END_PREFIX)) {
                break;
            }
            if (line.isEmpty()) {
                continue;
            }
            String token = extractEntityData(line);
            if (token == null) {
                throw new RuntimeException("Could not extract token");
            }
            String id = extractAttribute(line, "id");
            if (id == null) {
                throw new RuntimeException("Could not extract id");
            }
            int numericId = Integer.parseInt(id);
            if (numericId != previousId + 1) {
                throw new RuntimeException("I have assumed contiguous ids...");
            }
            previousId = numericId;

            if (id.equals(span[0])) {
                inside = true;
            }
            if (inside) {
                out.println(token);
            }
            if (id.equals(span[1])) {
                out.println();
                sentenceIndex++;
                if (sentenceIndex == spans.size()) {
                    break;
                }
                span = spans.get(sentenceIndex);
                inside = false;
            }
        }
    }

    /**
     * Converts one annotation file and appends the result to {@code out}.
     *
     * <p>Directories and files without a {@code provides="SENTENCES"} line are skipped silently
     * and produce no output. A progress line is printed to standard output for every file that
     * is processed.
     *
     * @param fileName path of the file holding the sentence annotation
     * @param out destination; flushed after the begin and end markers, never closed here
     */
    public static void processFile(String fileName, PrintWriter out) {
        try {
            if (new File(fileName).isDirectory()) {
                return;
            }

            String tokenFile;
            List<String[]> spans;
            BufferedReader br = openUtf8(fileName);
            try {
                String header = skipToLineContaining(br, "provides=\"SENTENCES\"");
                if (header == null) {
                    return;
                }
                tokenFile = extractAttribute(header, "scope");
                if (tokenFile == null) {
                    tokenFile = fileName;
                }
                System.out.println("Sentences from " + fileName + ", tokens from " + tokenFile);
                spans = readSentenceSpans(br, fileName);
            } finally {
                br.close();
            }

            // The token file is opened before the begin marker is written so that a missing
            // file fails before any output for this input has been produced.
            br = openUtf8(tokenFile);
            try {
                out.println(BEGIN_MARKER + tokenFile);
                out.println();
                out.flush();
                if (!spans.isEmpty()) {
                    writeSentences(br, tokenFile, spans, out);
                }
            } finally {
                br.close();
            }
            out.println(END_MARKER + tokenFile);
            out.println();
            out.flush();
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Runs {@link #processFile} on every entry of {@code dirName}, in sorted name order, writing
     * all results to {@code outFileName}.
     *
     * <p>The output file is written with {@link FileWriter}, i.e. in the platform default
     * encoding, whereas inputs are decoded as UTF-8.
     */
    public static void processDirectory(String dirName, String outFileName) {
        try {
            String[] files = new File(dirName).list();
            if (files == null) {
                // File.list() returns null when the path is not a directory or cannot be read.
                throw new IOException("Not a readable directory: " + dirName);
            }
            Arrays.sort(files);
            PrintWriter out = new PrintWriter(new FileWriter(outFileName));
            try {
                for (String file : files) {
                    processFile(dirName + File.separatorChar + file, out);
                }
            } finally {
                out.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /** Command-line entry point: {@code PreprocessSSTLight <input-directory> <output-file>}. */
    public static void main(String[] argv) {
        if (argv.length < 2) {
            System.err.println("Usage: PreprocessSSTLight <input-directory> <output-file>");
            System.exit(1);
        }
        processDirectory(argv[0], argv[1]);
    }
}