#!/usr/bin/python
# -*- coding: utf-8 -*- 

    # Copyright (C) 2010–2015 Agnieszka Patejuk

    # This program is free software: you can redistribute it and/or modify
    # it under the terms of the GNU General Public License as published by
    # the Free Software Foundation, either version 3 of the License, or
    # (at your option) any later version.

    # This program is distributed in the hope that it will be useful,
    # but WITHOUT ANY WARRANTY; without even the implied warranty of
    # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    # GNU General Public License for more details.

    # You should have received a copy of the GNU General Public License
    # along with this program.  If not, see <http://www.gnu.org/licenses/>.

import sys
import os
from common2xle import *

# Command-line arguments:
#   1. treebank  -- root directory that is walked recursively for input files
#   2. testsuite -- filename prefix for the report files created by this script
#   3. outpath   -- prefix prepended verbatim (no separator is added) to the
#                   per-sentence output file names
treebank = sys.argv[1]
testsuite = sys.argv[2]
outpath = sys.argv[3]

# Report files, opened once for the whole run:
#   good         -- sentence text of every file for which anyTrees() returns "OK"
#   failed       -- sentence text plus collected comment lines for all other files
#   segmentation -- one space-joined segment line per file handled by processInput()
good = open(testsuite+"-good", "w")
failed = open(testsuite+"-failed", "w")
segmentation = open(testsuite+"-good-segments", "w")

# Modified version of Michal Lenart's map.
# Maps old corpus directory names to their new identifiers; it is consulted by
# dendrSplit() through idCheckNKJP(), and names missing from the map are used
# unchanged.
old2newid = {'NKJP_1M_000500-GazetaPomorska': 'NKJP_1M_GazetaPomorska', 'NKJP_1M_1999': 'NKJP_1M_DP1999', 'NKJP_1M_2004': 'NKJP_1M_DP2004', 'NKJP_1M_2001': 'NKJP_1M_DP2001', 'NKJP_1M_1998': 'NKJP_1M_DP1998', 'NKJP_1M_001000-DziennikPolski1980': 'NKJP_1M_DP1980', 'NKJP_1M_2000': 'NKJP_1M_DP2000', 'NKJP_1M_TrybunaPLUSTrybunaLudu': 'NKJP_1M_TrybunaLuduPLUSTrybuna', 'NKJP_1M_012750-Rzeczpospolita': 'NKJP_1M_Rzeczpospolita', 'NKJP_1M_031875-SuperExpress': 'NKJP_1M_SuperExpress', 'NKJP_1M_010200-ZycieWarszawyPLUSZycie': 'NKJP_1M_ZycieWarszawyPLUSZycie', 'NKJP_1M_.': 'NKJP_1M_7123900001', 'NKJP_1M_2002': 'NKJP_1M_DP2002', 'NKJP_1M_2003': 'NKJP_1M_DP2003'}

def badinterps(dct):
    """Write the contents of *dct* to the file ``<testsuite>-disagrinterp``.

    For every key (an input file path) the report contains the sentence text
    returned by sentFind(), the path itself, one tab-separated line per
    entry, and a terminating blank line.

    dct -- maps an input file path to a list of entries; every entry is a
           sequence of unicode strings, which are joined with tabs and
           encoded as UTF-8 before being written.
    """
    # `with` guarantees that the report is flushed and closed, even when one
    # of the writes raises (the handle used to be left open).
    with open(testsuite+"-disagrinterp", "w") as out:
        for key in dct:
            out.write(sentFind(key)+"\n")
            out.write(key+"\n")
            for entry in dct[key]:
                out.write("\t".join(entry).encode("utf-8")+"\n")
            out.write("\n")

# Maps an input file path to the list of complete token entries for which no
# line containing disamb="true" was seen; filled by processInput() and written
# out by badinterps() at the end of the run.
disagrinterp = {}

def findInputPaths(rootpath):
    """Return the paths of all ``.xml`` files found anywhere below *rootpath*.

    rootpath -- directory that is walked recursively with os.walk().

    Returns a list of paths (each one is the walked directory joined with the
    file name), in the order os.walk() yields them.
    """
    inputFiles = []
    for root, dirs, files in os.walk(rootpath):
        for name in files:
            # Test the real extension.  The former regular expression
            # ".*.xml" was unanchored and its dot unescaped, so it also
            # accepted names such as "a.xml~" or "cxml".
            if name.endswith(".xml"):
                inputFiles.append(os.path.join(root, name))
    return inputFiles

# Paths of the input files selected by findInputPaths() below the treebank
# root given on the command line.
inputPaths = findInputPaths(treebank)

def idCheckNKJP(name, dct):
    """Return the value stored for *name* in *dct*, or *name* itself.

    name -- identifier to look up.
    dct  -- mapping from old identifiers to new ones.

    Identifiers that are not keys of *dct* are returned unchanged.
    """
    # dict.get replaces the deprecated dict.has_key (removed in Python 3)
    # and needs only a single lookup.
    return dct.get(name, name)

def dendrSplit(path):
    """Split a '/'-separated input path into [subdir, paragraph, sentence].

    The last path component minus its final four characters is the sentence
    part, the component before it is the paragraph part, and the third
    component from the end is the corpus directory.  The directory name is
    mapped through old2newid, 'PLUS' is replaced with '_', '4scal-' is
    removed and the first eight characters are cut off.
    """
    components = path.split('/')
    # file name without its last four characters (the '.xml' extension)
    sentence_part = components[-1][:-4]
    paragraph_part = components[-2]
    corpus_dir = idCheckNKJP(components[-3], old2newid)
    corpus_dir = corpus_dir.replace('PLUS', '_')
    corpus_dir = corpus_dir.replace('4scal-', '')
    # cut off the first eight characters, i.e. the 'NKJP_1M_' prefix
    return [corpus_dir[8:], paragraph_part, sentence_part]

def sentFind(inpath):
    """Return the text of the first ``<text>...</text>`` line in *inpath*.

    inpath -- path of the file to scan line by line.

    Returns the string between the opening and the closing tag, or None when
    no line of the file contains such an element.
    """
    # `with` closes the file on every exit path, including the early return.
    with open(inpath) as infile:
        for line in infile:
            # Capture the tag content instead of slicing at fixed offsets:
            # the former line[10:-8] was only right for exactly four leading
            # characters and a trailing newline after the closing tag.
            found = re.search("<text>(.*)</text>", line)
            if found:
                return found.group(1)
    return None

# faster version
def anyTrees(inpath):
    """Tell whether *inpath* has a full answer, or collect diagnostic lines.

    inpath -- path of the file to scan line by line.

    Returns the string "OK" as soon as a line containing
    ``<base-answer type="FULL" username=...>`` is found.  Otherwise returns
    the list of stripped lines collected up to the ``</answer-data>`` line
    (or up to the end of the file when that line is missing):
      * answer lines whose third whitespace-separated token, sliced
        [10:-2], differs from "none";
      * ``<comment>`` lines with non-empty content other than AUTO.
    """
    comments = []
    # `with` closes the file on every exit path, including the early return.
    with open(inpath) as infile:
        for line in infile:
            line = line.strip()
            if re.search('<base-answer type="FULL" username=.*>', line):
                return "OK"
            if re.search('<.*-answer type=.*>', line):
                if line.split()[2][10:-2] != "none":
                    comments.append(line)
            if re.search('<comment>.+</comment>', line) and not re.search('<comment>AUTO</comment>', line):
                comments.append(line)
            if line == '</answer-data>':
                break
    # Also reached when the file has no '</answer-data>' line; the function
    # used to return None in that case, which broke "\n".join() in the caller.
    return comments

def findName(path):
    """Build a dotted name from the last three components of *path*.

    The path is split on '/'; the result is the third-from-last component,
    the second-from-last component and the last component without its final
    four characters, joined with dots.
    """
    pieces = path.split('/')
    grandparent = pieces[-3]
    parent = pieces[-2]
    # last component without its final four characters (e.g. '.xml')
    stem = pieces[-1][:-4]
    return '.'.join([grandparent, parent, stem])

def processInput(file, testfilename):
    """Extract the chosen terminals of one sentence file.

    file         -- path of the input file; it is read line by line and every
                    line is decoded from UTF-8.
    testfilename -- path of the single-sentence test file to create.

    While scanning, a line matching a ``<node ... chosen="true">`` element
    opens a node (its from/to values are remembered) and ``</node>`` closes
    it; ``<terminal token_id=`` opens a token and ``</terminal>`` closes it.
    Inside an open node and token the <orth>, non-empty <base> and
    <f type="tag"> values are appended to the current entry in the order in
    which their lines occur.

    Side effects:
      * the orth values, ordered by [from, orth, to], are written
        space-separated and UTF-8 encoded to the global `segmentation` file
        and to *testfilename*;
      * every complete entry for which no disamb="true" line was seen is
        recorded in the global `disagrinterp` dictionary under *file*.

    Returns the list of complete (three-element) entries.
    """
    inside_node = False
    inside_tok = False
    # True once a line with disamb="true" was seen inside the current token;
    # used to find entries whose chosen interpretation differs from NKJP.
    good_interp = False
    terminals = []
    dictform = []
    # [from, orth, to] triples, used only for the segmentation output
    unordered = []
    start = ""
    end = ""
    # `with` closes the input file even if parsing raises.
    with open(file, 'r') as infile:
        for line in infile:
            line = line.strip().decode("utf-8")
            if re.search('<node nid=".+" from=".+" to=".+" subtrees=".+" chosen="true">', line):
                inside_node = True
                parts = line.split()
                # third token is from="N", fourth token is to="N"
                start = parts[2][6:-1]
                end = parts[3][4:-1]
            if re.search("<terminal token_id=.*", line):
                inside_tok = True
            if re.search("</terminal>", line):
                inside_tok = False
                # only complete entries (three collected values) are kept
                if len(dictform) == 3:
                    terminals.append(dictform)
                    # attention! sometimes there is no good interp in NKJP
                    # (Al in Skladnica-0.5/NKJP_1M_1305000001291/morph_1-p/morph_1.54-s.xml)
                    if not good_interp:
                        # setdefault replaces the deprecated has_key test
                        disagrinterp.setdefault(file, []).append(dictform)
                good_interp = False
                dictform = []
            if re.search("</node>", line):
                inside_node = False
            if inside_node and inside_tok:
                if re.search('disamb="true"', line):
                    good_interp = True
                if re.search("<orth>.*</orth>", line):
                    # strip the tags, then remove whitespace inside orth
                    orth = line[6:-7].strip().replace(' ', '')
                    dictform.append(orth)
                    unordered.append([int(start), orth, int(end)])
                if re.search("<base>.*</base>", line):
                    base = line[6:-7].strip()
                    # entries with an empty base stay incomplete and are dropped
                    if base:
                        dictform.append(base)
                if re.search('<f type="tag">.*</f>', line):
                    dictform.append(line[14:-4])
    ordered = [element[1].encode("utf-8") for element in sorted(unordered)]
    segment_line = " ".join(ordered)+"\n\n"
    segmentation.write(segment_line)
    # writing single test files; `with` flushes and closes the file
    with open(testfilename, 'w') as testfile:
        testfile.write(segment_line)
    return terminals

def wyklucz_powtorki(terminals, dct):
    """Group *terminals* by token in *dct*, skipping repeated entries.

    (The Polish name means "exclude repetitions".)

    terminals -- iterable of entries; element 0 of an entry is the token.
    dct       -- dictionary updated in place: token -> list of the distinct
                 entries seen for that token, in first-seen order.

    Returns None.
    """
    for terminal in terminals:
        # setdefault replaces the deprecated dict.has_key test
        entries = dct.setdefault(terminal[0], [])
        if terminal not in entries:
            entries.append(terminal)

# Main loop: files with a full answer go to `good` and get their own TEST and
# DICT files; all other files go to `failed` together with the collected
# comment lines.
# NOTE(review): `log` and `zapisz_slownik` are not defined in this file --
# presumably they come from `from common2xle import *`; confirm.
for inputPath in inputPaths:
    sent = sentFind(inputPath)
    # Scan the file once and reuse the result (it used to be parsed a second
    # time in the failure branch).
    verdict = anyTrees(inputPath)
    if verdict == "OK":
        log.write(inputPath+"\n")
        localdct = {}
        dendr_info = dendrSplit(inputPath)
        dendr_subdir, dendr_par, dendr_sent = dendr_info
        good.write(sent + "\n\n")
        # The name ends with a double underscore so that the TEST/DICT suffix
        # can be told apart from underscores inside the other name parts.
        basename = outpath+'FULL'+'_'+dendr_subdir+'_'+dendr_par+'_'+dendr_sent+'__'
        wyklucz_powtorki(processInput(inputPath, basename+'TEST'), localdct)
        zapisz_slownik(basename+'DICT', localdct)
        log.write("\n\n")
        # # for eliminateSF
        # segmentation.close()
        # for creating Swigra batch parse files
        # doa = "doa/" + findName(inputPath) + ".doa"
        # open(doa, "w").write(":-analiza('" + sent + "', wypowiedzenie)." + "\n" + ":-halt.")
    else:
        # anyTrees() may return None for a file without '</answer-data>';
        # treat that as "no comments" instead of failing in join().
        comments = "\n".join(verdict or [])
        failed.write(sent + "\n" + comments + "\n\n")

# Final reports, written after all input files have been processed.
# NOTE(review): extraInfo, extraInfoDct, punct, oov and oov_brev are not
# defined in this file -- presumably they come from `from common2xle import *`
# and are filled while the dictionaries are saved; confirm.
extraInfo(punct, testsuite, "punct")
extraInfo(oov, testsuite, "oov")
# extraInfo(oov_brev, testsuite, "oov_brev")
extraInfoDct(oov_brev, testsuite, "oov_brev")
# extraInfo(segf, testsuite, "segf")

# if len(segf) > 0:
# 	eliminateSF(testsuite+"-good-segments")

# Write the disagreement report only when processInput() recorded at least
# one entry without a disamb="true" line.
if len(disagrinterp) > 0:
	badinterps(disagrinterp)
