package lkformat2;

import java.io.*;
import java.util.*;

/**
 * Flattens stand-off sentence/token annotation files into a plain text file
 * with one token per line and a blank line after every sentence.
 *
 * <p>The input is scanned line by line (no XML parser is used), so every
 * annotation header, every sentence element, every token element and every
 * closing <code>&lt;/annotation&gt;</code> tag must sit on a line of its own.
 * All input files are read as UTF-8 and the output is written as UTF-8.
 *
 * <p>Any error while processing a file prints a stack trace and terminates
 * the JVM with exit status 1.
 */
public class PreprocessSSTLight {

	/** Encoding used for every file that is read or written. */
	private static final String ENCODING = "UTF-8";

	/** Substring that marks the end of an annotation section. */
	private static final String ANNOTATION_END = "</annotation>";

	/**
	 * Returns the value of the first occurrence of <code>attr="..."</code> in a line.
	 * The match is a plain substring search for <code>attr="</code>.
	 *
	 * @param line the line to search
	 * @param attr the attribute name
	 * @return the text between the quotes, or null if the attribute is absent
	 * @throws IllegalArgumentException if the opening quote has no closing quote
	 */
	private static String extractAttribute(String line, String attr) {
		String s = attr + "=\"";
		int ix1 = line.indexOf(s);
		if(ix1 == -1)
			return null;
		ix1 += s.length();
		int ix2 = line.indexOf("\"", ix1);
		// Without this check substring() fails with an opaque index error.
		if(ix2 == -1)
			throw new IllegalArgumentException("Unterminated value for attribute " + attr + ": line = " + line);
		return line.substring(ix1, ix2);
	}

	/**
	 * Returns the text between the end of the first <code>&lt;e ...&gt;</code> opening tag
	 * and the last <code>&lt;/e&gt;</code> closing tag of a line.
	 *
	 * @param line the line to search
	 * @return the enclosed text, or null if the line holds no such element
	 */
	private static String extractEntityData(String line) {
		int ix1 = line.indexOf("<e");
		if(ix1 == -1)
			return null;
		int ix2 = line.indexOf(">", ix1);
		if(ix2 == -1)
			return null;
		int ix3 = line.lastIndexOf("</e>");
		// The closing tag must follow the opening tag, otherwise there is no content.
		if(ix3 == -1 || ix3 < ix2 + 1)
			return null;
		return line.substring(ix2 + 1, ix3);
	}

	/** Opens a file for buffered, UTF-8 decoded reading. */
	private static BufferedReader openReader(String fileName) throws IOException {
		return new BufferedReader(new InputStreamReader(new FileInputStream(fileName), ENCODING));
	}

	/**
	 * Advances the reader to the first line containing <code>provides="NAME"</code>
	 * and returns that line, or null if the end of the file is reached first.
	 */
	private static String findAnnotationHeader(BufferedReader br, String provides) throws IOException {
		String marker = "provides=\"" + provides + "\"";
		String line = br.readLine();
		while(line != null && !line.contains(marker))
			line = br.readLine();
		return line;
	}

	/**
	 * Reads sentence spans up to the closing annotation tag. Every line must carry
	 * <code>start</code> and <code>end</code> attributes of the form <code>#id</code>;
	 * each returned pair holds the two ids without the leading '#'.
	 */
	private static List<String[]> readSentenceSpans(BufferedReader br, String fileName) throws IOException {
		List<String[]> spans = new ArrayList<String[]>();
		for(;;) {
			String line = br.readLine();
			if(line == null)
				throw new RuntimeException("Unexpected end of file inside SENTENCES annotation: " + fileName);
			if(line.contains(ANNOTATION_END))
				return spans;

			String start = extractAttribute(line, "start");
			String end = extractAttribute(line, "end");

			if(start == null || end == null)
				throw new RuntimeException("Only start-end annotation supported for sentences");
			// startsWith is also safe for an empty attribute value.
			if(!start.startsWith("#") || !end.startsWith("#"))
				throw new RuntimeException("Only relative URIs supported for sentences: line = " + line + " start = " + start + " end = " + end);

			spans.add(new String[] { start.substring(1), end.substring(1) });
		}
	}

	/**
	 * Reads token lines up to the closing annotation tag and prints the text of every
	 * token that lies inside one of the sentence spans, followed by a blank line at the
	 * end of each sentence. Token ids must be the integers 1, 2, 3, ... in file order.
	 * Stops early once the last span has been completed.
	 *
	 * @param spans non-empty list of { firstTokenId, lastTokenId } pairs in file order
	 */
	private static void writeSentences(BufferedReader br, List<String[]> spans, PrintWriter out, String tokenFile) throws IOException {
		int senPos = 0;
		String[] senSpan = spans.get(senPos);
		boolean inside = false;
		int prev = 0;

		for(;;) {
			String line = br.readLine();
			if(line == null)
				throw new RuntimeException("Unexpected end of file inside TOKENS annotation: " + tokenFile);
			if(line.contains(ANNOTATION_END))
				return;

			line = line.trim();
			if(line.equals(""))
				continue;

			String t = extractEntityData(line);
			if(t == null)
				throw new RuntimeException("Could not extract token");
			String id = extractAttribute(line, "id");
			if(id == null)
				throw new RuntimeException("Could not extract id");

			int idi = Integer.parseInt(id);
			if(idi != prev + 1)
				throw new RuntimeException("I have assumed contiguous ids...");
			prev = idi;

			if(id.equals(senSpan[0]))
				inside = true;

			if(inside)
				out.println(t);

			if(id.equals(senSpan[1])) {
				out.println();
				senPos++;
				if(senPos == spans.size())
					return;
				senSpan = spans.get(senPos);
				inside = false;
			}
		}
	}

	/**
	 * Processes one annotation file and appends its sentences to <code>out</code>.
	 *
	 * <p>Directories and files without a <code>provides="SENTENCES"</code> line are
	 * skipped silently. Tokens are read from the file named by the <code>scope</code>
	 * attribute of the SENTENCES header line, or from <code>fileName</code> itself when
	 * that attribute is absent. The output for a file is bracketed by
	 * <code>___BEGIN___|tokenFile</code> and <code>___END___|tokenFile</code> lines,
	 * each followed by a blank line.
	 *
	 * <p>On any error the stack trace is printed and the JVM exits with status 1.
	 *
	 * @param fileName path of the file holding the sentence annotation
	 * @param out writer receiving the output; it is flushed but not closed
	 */
	public static void processFile(String fileName, PrintWriter out) {
		try {
			if(new File(fileName).isDirectory())
				return;

			String tokenFile;
			List<String[]> spans;

			BufferedReader br = openReader(fileName);
			try {
				String header = findAnnotationHeader(br, "SENTENCES");
				if(header == null)
					return;

				tokenFile = extractAttribute(header, "scope");
				if(tokenFile == null)
					tokenFile = fileName;

				System.out.println("Sentences from " + fileName + ", tokens from " + tokenFile);

				spans = readSentenceSpans(br, fileName);
			} finally {
				// Close on every path, including the error paths.
				br.close();
			}

			br = openReader(tokenFile);
			try {
				out.println("___BEGIN___|" + tokenFile);
				out.println();
				out.flush();

				String header = findAnnotationHeader(br, "TOKENS");

				// A file without sentences still gets its BEGIN/END markers.
				if(!spans.isEmpty()) {
					if(header == null)
						throw new RuntimeException("No TOKENS annotation found in " + tokenFile);
					writeSentences(br, spans, out, tokenFile);
				}
			} finally {
				br.close();
			}

			out.println("___END___|" + tokenFile);
			out.println();
			out.flush();

		} catch(Exception e) {
			e.printStackTrace();
			System.exit(1);
		}
	}

	/**
	 * Processes every entry of a directory, in sorted name order, into one output file.
	 *
	 * <p>On any error the stack trace is printed and the JVM exits with status 1.
	 *
	 * @param dirName directory holding the annotation files
	 * @param outFileName file to create (or overwrite) with the UTF-8 encoded result
	 */
	public static void processDirectory(String dirName, String outFileName) {
		try {
			// Write UTF-8 explicitly: the input is decoded as UTF-8, so relying on the
			// platform default here would corrupt non-ASCII tokens on some systems.
			PrintWriter out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outFileName), ENCODING));
			try {
				String[] files = new File(dirName).list();
				// list() returns null when dirName is not a readable directory.
				if(files == null)
					throw new IOException("Not a readable directory: " + dirName);
				Arrays.sort(files);
				for(String file: files) {
					processFile(dirName + File.separatorChar + file, out);
				}
				// PrintWriter never throws on write failures; ask it explicitly.
				if(out.checkError())
					throw new IOException("Error while writing " + outFileName);
			} finally {
				out.close();
			}
		} catch(Exception e) {
			e.printStackTrace();
			System.exit(1);
		}
	}

	/**
	 * Command line entry point.
	 *
	 * @param argv input directory followed by output file name
	 */
	public static void main(String[] argv) {
		if(argv.length < 2) {
			System.err.println("Usage: PreprocessSSTLight <input directory> <output file>");
			System.exit(1);
		}
		processDirectory(argv[0], argv[1]);
	}

}