package lkformat2;

import java.io.*;
import java.util.*;

/**
 * Converts line-oriented XML annotation files into a tab-separated,
 * one-token-per-line text format.
 *
 * <p>The input is not parsed as XML. It is scanned line by line: an annotation
 * section starts on a line containing {@code provides="NAME"} and ends on a
 * line containing {@code </annotation>}; every non-blank line in between is
 * expected to hold exactly one entry. Four sections are used:
 * <ul>
 *   <li>{@code SENTENCES}: entries with {@code start="#id"} and
 *       {@code end="#id"} attributes naming the first and last token of a
 *       sentence;</li>
 *   <li>{@code TOKENS}: {@code <e id="n">form</e>} entries whose ids must be
 *       the contiguous integers 1, 2, 3, ...;</li>
 *   <li>{@code POS} and {@code LEMMA}: {@code <e on="#n">value</e>} entries,
 *       both read from the same file, with {@code POS} appearing first.</li>
 * </ul>
 *
 * <p>Any failure while processing is reported with a stack trace and
 * terminates the JVM with exit status 1.
 */
public class PreprocessParser {

	/** Character encoding used for every input file and for the output file. */
	private static final String ENCODING = "UTF-8";

	/** Text whose presence on a line marks the end of an annotation section. */
	private static final String ANNOTATION_END = "</annotation>";

	/**
	 * Returns the value of a double-quoted attribute found on a line.
	 *
	 * <p>An occurrence of {@code attr="} only counts when the character before
	 * it cannot be part of a longer attribute name, so that looking for
	 * {@code on} does not match inside {@code version="..."}.
	 *
	 * @param line the line to search
	 * @param attr the attribute name
	 * @return the text between the quotes, or {@code null} if the attribute is
	 *         absent or its closing quote is missing
	 */
	private static String extractAttribute(String line, String attr) {
		String marker = attr + "=\"";
		int from = 0;
		while(true) {
			int nameStart = line.indexOf(marker, from);
			if(nameStart == -1)
				return null;
			if(nameStart == 0 || !isNameChar(line.charAt(nameStart - 1))) {
				int valueStart = nameStart + marker.length();
				int valueEnd = line.indexOf('"', valueStart);
				if(valueEnd == -1)
					return null;
				return line.substring(valueStart, valueEnd);
			}
			// This hit was the tail of a longer name; keep looking further right.
			from = nameStart + 1;
		}
	}

	/**
	 * Tells whether a character directly preceding an attribute name means
	 * that the match is only the tail of a longer name. A colon is deliberately
	 * not included, so a prefixed name such as {@code xml:id} still matches
	 * {@code id}.
	 */
	private static boolean isNameChar(char c) {
		return Character.isLetterOrDigit(c) || c == '_' || c == '-' || c == '.';
	}

	/**
	 * Returns the text enclosed by an {@code <e ...>} opening tag and the last
	 * {@code </e>} on the line.
	 *
	 * @param line the line to search
	 * @return the enclosed text (possibly empty), or {@code null} if the line
	 *         does not contain an opening tag followed by a closing tag
	 */
	private static String extractEntityData(String line) {
		int open = line.indexOf("<e");
		if(open == -1)
			return null;
		int openEnd = line.indexOf(">", open);
		if(openEnd == -1)
			return null;
		int close = line.lastIndexOf("</e>");
		if(close == -1 || close < openEnd)
			return null;
		return line.substring(openEnd + 1, close);
	}

	/** Opens a file for buffered, UTF-8 decoded, line-wise reading. */
	private static BufferedReader openReader(String path) throws IOException {
		return new BufferedReader(new InputStreamReader(new FileInputStream(path), ENCODING));
	}

	/**
	 * Reads forward until a line containing {@code provides="layer"} is found.
	 *
	 * @return that line, or {@code null} if the end of the file is reached first
	 */
	private static String skipToAnnotation(BufferedReader br, String layer) throws IOException {
		String marker = "provides=\"" + layer + "\"";
		String line = br.readLine();
		while(line != null && !line.contains(marker))
			line = br.readLine();
		return line;
	}

	/** Reads one line of an annotation section, failing if the file ends before the section is closed. */
	private static String readRequiredLine(BufferedReader br, String layer) throws IOException {
		String line = br.readLine();
		if(line == null)
			throw new RuntimeException("Unexpected end of file inside " + layer + " annotation");
		return line;
	}

	/**
	 * Reads the {@code SENTENCES} section of a file.
	 *
	 * @param fileName file to read
	 * @param spans    receives one {@code {startId, endId}} pair per sentence,
	 *                 with the leading {@code #} removed
	 * @return the name of the file holding the tokens (the {@code scope}
	 *         attribute of the section header, or {@code fileName} when there
	 *         is none), or {@code null} if the file has no such section
	 */
	private static String readSentenceSpans(String fileName, List<String[]> spans) throws IOException {
		BufferedReader br = openReader(fileName);
		try {
			String header = skipToAnnotation(br, "SENTENCES");
			if(header == null)
				return null;

			String tokenFile = extractAttribute(header, "scope");
			if(tokenFile == null)
				tokenFile = fileName;

			String line = readRequiredLine(br, "SENTENCES");
			while(!line.contains(ANNOTATION_END)) {
				// Blank lines are skipped, as in the other sections.
				if(line.trim().length() > 0) {
					String start = extractAttribute(line, "start");
					String end = extractAttribute(line, "end");

					if(start == null)
						throw new RuntimeException("Only start-end annotation supported for sentences");
					if(end == null)
						throw new RuntimeException("Only start-end annotation supported for sentences");

					if(!start.startsWith("#"))
						throw new RuntimeException("Only relative URIs supported for sentences: line = " + line + " start = " + start + " end = " + end);
					if(!end.startsWith("#"))
						throw new RuntimeException("Only relative URIs supported for sentences: line = " + line + " start = " + start + " end = " + end);

					spans.add(new String[] { start.substring(1), end.substring(1) });
				}
				line = readRequiredLine(br, "SENTENCES");
			}
			return tokenFile;
		} finally {
			br.close();
		}
	}

	/**
	 * Reads the {@code TOKENS} section and keeps the tokens that lie inside a
	 * sentence span. Reading stops as soon as the last sentence is complete.
	 *
	 * @param tokenFile file to read
	 * @param spans     sentence spans, in the order their tokens appear
	 * @param byId      receives every kept token record, keyed by numeric id
	 * @return the kept token records in file order; each record is
	 *         {@code {id, form, null, null}}, the last two slots being filled
	 *         in later with the POS tag and the lemma
	 */
	private static List<String[]> readTokens(String tokenFile, List<String[]> spans, Map<Integer, String[]> byId) throws IOException {
		List<String[]> tokens = new ArrayList<String[]>();
		BufferedReader br = openReader(tokenFile);
		try {
			if(skipToAnnotation(br, "TOKENS") == null)
				throw new RuntimeException("No TOKENS annotation found!");

			int senPos = 0;
			String[] senSpan = spans.get(senPos);
			boolean inside = false;
			int prev = 0;

			String line = readRequiredLine(br, "TOKENS");
			while(!line.contains(ANNOTATION_END)) {
				line = line.trim();
				if(!line.equals("")) {
					String t = extractEntityData(line);
					if(t == null)
						throw new RuntimeException("Could not extract token");
					String id = extractAttribute(line, "id");
					if(id == null)
						throw new RuntimeException("Could not extract id");

					int idi = Integer.parseInt(id);
					if(idi != prev + 1)
						throw new RuntimeException("I have assumed contiguous ids...");
					prev = idi;

					if(id.equals(senSpan[0]))
						inside = true;

					if(inside) {
						String[] ts = new String[4];
						ts[0] = id;
						ts[1] = t;
						tokens.add(ts);
						byId.put(Integer.valueOf(idi), ts);
					}

					if(id.equals(senSpan[1])) {
						senPos++;
						if(senPos == spans.size())
							break;
						senSpan = spans.get(senPos);
						inside = false;
					}
				}
				line = readRequiredLine(br, "TOKENS");
			}
		} finally {
			br.close();
		}
		return tokens;
	}

	/**
	 * Reads one {@code <e on="#n">value</e>} section from the current reader
	 * position and stores each value in the record of the token it refers to.
	 *
	 * <p>Tokens are looked up by id rather than by list position, because
	 * tokens outside every sentence span are not kept. Annotations that refer
	 * to such tokens are ignored.
	 *
	 * @param br    reader positioned before the section header
	 * @param layer section name, e.g. {@code POS}
	 * @param byId  kept token records keyed by numeric id
	 * @param slot  index in the token record that receives the value
	 */
	private static void readLayer(BufferedReader br, String layer, Map<Integer, String[]> byId, int slot) throws IOException {
		if(skipToAnnotation(br, layer) == null)
			throw new RuntimeException("No " + layer + " annotation found!");

		String line = readRequiredLine(br, layer);
		while(!line.contains(ANNOTATION_END)) {
			line = line.trim();
			if(!line.equals("")) {
				String t = extractEntityData(line);
				if(t == null)
					throw new RuntimeException("Could not extract token");

				String on = extractAttribute(line, "on");
				if(on == null)
					throw new RuntimeException("Only on annotation supported for sentences");
				if(!on.startsWith("#"))
					throw new RuntimeException("Only relative URIs supported for sentences: line = " + line + " on = " + on);

				String[] ts = byId.get(Integer.valueOf(on.substring(1)));
				if(ts != null)
					ts[slot] = t;
			}
			line = readRequiredLine(br, layer);
		}
	}

	/**
	 * Writes the token records, one line each, numbering tokens from 1 within
	 * each sentence and emitting an empty line after the last token of a
	 * sentence. Writing stops once the last sentence has been closed.
	 */
	private static void writeSentences(List<String[]> tokens, List<String[]> spans, PrintWriter out) {
		int senPos = 0;
		String[] senSpan = spans.get(senPos);
		int posInSentence = 0;

		for(String[] ts: tokens) {
			posInSentence++;

			out.println(posInSentence + "\t" + ts[1] + "\t" + ts[3] + "\t_\t" + ts[2] + "\t"
					+ ts[1] + "\t" + ts[3] + "\t" + ts[2] + "\t0\tROOT");

			// Exact match, the same test that decided the span when reading the tokens.
			if(ts[0].equals(senSpan[1])) {
				out.println();
				senPos++;
				if(senPos == spans.size())
					break;
				senSpan = spans.get(senPos);
				posInSentence = 0;
			}
		}
	}

	/**
	 * Converts one file and appends the result to {@code out}.
	 *
	 * <p>Nothing is written when {@code fileName} is a directory, has no
	 * {@code SENTENCES} section, or that section is empty. Otherwise the
	 * tokens are read from the file named by the section's {@code scope}
	 * attribute (or from {@code fileName} itself), and the POS tags and lemmas
	 * from the file whose name is obtained by replacing the next-to-last
	 * extension of the token file name ({@code .xxx.xml}) with
	 * {@code .pos.xml}. The output is a {@code ___BEGIN___} marker record, the
	 * sentences, and an {@code ___END___} marker record.
	 *
	 * <p>On any error the stack trace is printed and the JVM exits with
	 * status 1.
	 *
	 * @param fileName path of the file holding the sentence annotation
	 * @param out      destination of the converted text; not closed here
	 */
	public static void processFile(String fileName, PrintWriter out) {
		try {
			if(new File(fileName).isDirectory())
				return;

			List<String[]> spans = new ArrayList<String[]>();
			String tokenFile = readSentenceSpans(fileName, spans);
			if(tokenFile == null || spans.isEmpty())
				return;

			Map<Integer, String[]> byId = new HashMap<Integer, String[]>();
			List<String[]> tokens = readTokens(tokenFile, spans, byId);

			String posFile = tokenFile.replaceAll("\\.[^\\.]+\\.xml", ".pos.xml");
			BufferedReader br = openReader(posFile);
			try {
				out.println("0\t___BEGIN___|" + tokenFile + "\t_\t_\t_\t_\t___BEGIN___|" + tokenFile + "\t_\t0\tROOT");
				out.println();

				// Both sections are read from the same reader, so LEMMA must follow POS.
				readLayer(br, "POS", byId, 2);
				readLayer(br, "LEMMA", byId, 3);
			} finally {
				br.close();
			}

			writeSentences(tokens, spans, out);

			out.println("1\t___END___\t_\t_\t_\t___END___\t_\t_\t0\tROOT");
			out.println();

		} catch(Exception e) {
			e.printStackTrace();
			System.exit(1);
		}
	}

	/**
	 * Converts every entry of a directory, in sorted name order, into a single
	 * UTF-8 encoded output file.
	 *
	 * <p>On any error the stack trace is printed and the JVM exits with
	 * status 1.
	 *
	 * @param dirName     directory whose entries are passed to
	 *                    {@link #processFile(String, PrintWriter)}
	 * @param outFileName file to create or overwrite
	 */
	public static void processDirectory(String dirName, String outFileName) {
		try {
			String[] files = new File(dirName).list();
			if(files == null)
				throw new IOException("Not a readable directory: " + dirName);
			Arrays.sort(files);

			// Written as UTF-8 to match the encoding the input is decoded with.
			PrintWriter out = new PrintWriter(new BufferedWriter(
					new OutputStreamWriter(new FileOutputStream(outFileName), ENCODING)));
			try {
				for(String file: files) {
					processFile(dirName + File.separatorChar + file, out);
				}
				// PrintWriter never throws on write failures; ask explicitly.
				if(out.checkError())
					throw new IOException("Error writing " + outFileName);
			} finally {
				out.close();
			}
		} catch(Exception e) {
			e.printStackTrace();
			System.exit(1);
		}
	}

	/**
	 * Command-line entry point.
	 *
	 * @param argv {@code argv[0]} is the input directory, {@code argv[1]} the
	 *             output file
	 */
	public static void main(String[] argv) {
		if(argv.length < 2) {
			System.err.println("Usage: PreprocessParser <input-directory> <output-file>");
			System.exit(1);
		}
		processDirectory(argv[0], argv[1]);
	}

}