/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.international.spanish.pipeline;

import edu.stanford.nlp.international.spanish.SpanishVerbStripper;
import edu.stanford.nlp.international.spanish.pipeline.MultiWordTreeExpander;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeNormalizer;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.international.spanish.SpanishTreeNormalizer;
import edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;

public final class MultiWordPreprocessor {
    /** Redwood logging channel for this class. */
    private static Redwood.RedwoodChannels log = Redwood.channels(MultiWordPreprocessor.class);
    /** Number of preterminals labeled {@code "MW?"} seen by {@code traverseAndFix}. */
    private static int nMissingPOS;
    /** Number of nodes whose label starts with {@code "MW_PHRASE?"} seen by {@code traverseAndFix}. */
    private static int nMissingPhrasal;
    /** Number of {@code "MW?"} preterminals for which {@code inferPOS} produced a non-null tag. */
    private static int nFixedPOS;
    /** Number of {@code "MW_PHRASE?"} nodes for which {@code inferPhrasalCategory} produced a non-null label. */
    private static int nFixedPhrasal;
    /**
     * Lookup used by {@code inferPhrasalCategory}: the key is the text after the last
     * underscore of a node label, the value is the phrasal category to assign.
     */
    private static final Map<String, String> phrasalCategoryMap;
    /** Used by {@code inferPOS} to separate pronouns from a candidate verb form. */
    private static final SpanishVerbStripper verbStripper;
    /**
     * Option table handed to {@code StringUtils.argsToProperties} in {@code main}.
     * NOTE(review): the integer is presumably the number of arguments each option takes -- confirm.
     */
    private static final Map<String, Integer> argOptionDefs;

    /**
     * Adds the (word, tag) pairs found in the tagged yield of a tree to the
     * unigram tagger counts.
     *
     * <p>Tokens whose tag is the placeholder {@code "MW?"} are not counted.
     *
     * @param tagger counter incremented once per counted (word, tag) pair
     * @param t tree whose tagged yield is read
     */
    public static void updateTagger(TwoDimensionalCounter<String, String> tagger, Tree t) {
        for (CoreLabel token : t.taggedLabeledYield()) {
            String tag = token.tag();
            if (!tag.equals("MW?")) {
                tagger.incrementCount(token.word(), tag);
            }
        }
    }

    /**
     * Walks a tree and replaces placeholder labels with inferred ones.
     *
     * <p>Preterminals labeled {@code "MW?"} receive the tag returned by
     * {@code inferPOS}; other nodes are processed after their children, and
     * those whose label starts with {@code "MW_PHRASE?"} receive the category
     * returned by {@code inferPhrasalCategory}. A label is left untouched when
     * inference returns {@code null}. The missing/fixed counters of this class
     * are updated along the way.
     *
     * @param t node to process
     * @param parent parent of {@code t}, or {@code null} for the root
     * @param unigramTagger word/tag counts consulted when inferring a POS tag
     * @param retainNER forwarded to {@code inferPhrasalCategory}
     */
    public static void traverseAndFix(Tree t, Tree parent, TwoDimensionalCounter<String, String> unigramTagger, boolean retainNER) {
        if (!t.isPreTerminal()) {
            // Children first, so the node itself is relabeled last.
            for (Tree child : t.children()) {
                MultiWordPreprocessor.traverseAndFix(child, t, unigramTagger, retainNER);
            }
            if (!t.value().startsWith("MW_PHRASE?")) {
                return;
            }
            ++nMissingPhrasal;
            String category = MultiWordPreprocessor.inferPhrasalCategory(t, retainNER);
            if (category == null) {
                return;
            }
            t.setValue(category);
            ++nFixedPhrasal;
            return;
        }

        if (!t.value().equals("MW?")) {
            return;
        }
        ++nMissingPOS;
        String tag = MultiWordPreprocessor.inferPOS(t, parent, unigramTagger);
        if (tag == null) {
            return;
        }
        t.setValue(tag);
        ++nFixedPOS;
    }

    /**
     * Builds the space-separated string of leaf values under {@code parent}.
     *
     * @param t the node of interest (not read by this method)
     * @param parent node whose yield forms the phrase; may be {@code null}
     * @return the leaf values of {@code parent} joined by single spaces, an
     *         empty string when the yield is empty, or {@code null} when
     *         {@code parent} is {@code null}
     */
    private static String getContainingPhrase(Tree t, Tree parent) {
        if (parent == null) {
            return null;
        }
        StringBuilder containingPhrase = new StringBuilder();
        // Emit the separator before every element except the first, so an
        // empty yield yields "" rather than failing on a negative substring index.
        String separator = "";
        for (Label l : parent.yield()) {
            containingPhrase.append(separator).append(l.value());
            separator = " ";
        }
        return containingPhrase.toString();
    }

    /**
     * Infers a POS tag for a preterminal that carries a placeholder tag.
     *
     * <p>Sources are tried in this order:
     * <ol>
     *   <li>{@code ManualUWModel.getOverrideTag} for the word and its containing phrase;</li>
     *   <li>the most frequent tag of the pronoun-stripped verb stem, if the
     *       stem is known to the unigram tagger and that tag starts with {@code "v"};</li>
     *   <li>the best tag of the word itself in the unigram tagger, with
     *       {@code POSTieBreaker} supplied to {@code Counters.argmax};</li>
     *   <li>{@code ManualUWModel.getTag} as the fallback.</li>
     * </ol>
     *
     * @param t preterminal whose first child holds the word
     * @param parent parent of {@code t}, used to build the containing phrase; may be {@code null}
     * @param unigramTagger word/tag counts
     * @return the inferred tag
     */
    private static String inferPOS(Tree t, Tree parent, TwoDimensionalCounter<String, String> unigramTagger) {
        String word = t.firstChild().value();
        String containingPhraseStr = MultiWordPreprocessor.getContainingPhrase(t, parent);

        String overrideTag = ManualUWModel.getOverrideTag(word, containingPhraseStr);
        if (overrideTag != null) {
            return overrideTag;
        }

        Set<String> unigramTaggerKeys = unigramTagger.firstKeySet();

        // Membership is checked before getCounter is called for the stem or the word.
        SpanishVerbStripper.StrippedVerb strippedVerb = verbStripper.separatePronouns(word);
        if (strippedVerb != null && unigramTaggerKeys.contains(strippedVerb.getStem())) {
            String pos = Counters.argmax(unigramTagger.getCounter(strippedVerb.getStem()));
            if (pos != null && pos.startsWith("v")) {
                return pos;
            }
        }

        if (unigramTaggerKeys.contains(word)) {
            return Counters.argmax(unigramTagger.getCounter(word), new POSTieBreaker());
        }

        return ManualUWModel.getTag(word, containingPhraseStr);
    }

    /**
     * Infers the phrasal category for a node carrying a placeholder phrase label.
     *
     * <p>The text after the last underscore of the node label is looked up in
     * {@code phrasalCategoryMap}. If it is not found and starts with
     * {@code 'n'}, a nominal group label is returned; when {@code retainNER}
     * is set, the last character of the full label selects a more specific
     * nominal label. Otherwise a message listing the child labels is logged.
     *
     * @param t node to categorize
     * @param retainNER whether to return the NER-specific nominal labels
     * @return the phrasal category, or {@code null} if none could be inferred
     */
    private static String inferPhrasalCategory(Tree t, boolean retainNER) {
        String label = t.value();
        String mwePos = label.substring(label.lastIndexOf('_') + 1);

        String mapped = phrasalCategoryMap.get(mwePos);
        if (mapped != null) {
            return mapped;
        }

        if (mwePos.startsWith("n")) {
            if (retainNER) {
                // The final character of the label picks the nominal subtype.
                switch (label.charAt(label.length() - 1)) {
                    case 'l':
                        return "grup.nom.lug";
                    case 'o':
                        return "grup.nom.org";
                    case 'p':
                        return "grup.nom.pers";
                    case '0':
                        return "grup.nom.otros";
                    default:
                        break;
                }
            }
            return "grup.nom";
        }

        // Nothing matched: report the child labels to help extend the table.
        StringBuilder kidLabels = new StringBuilder();
        Tree[] kids = t.children();
        for (int i = 0; i < kids.length; i++) {
            kidLabels.append(kids[i].value()).append(' ');
        }
        log.info("No phrasal cat for: " + kidLabels.toString().trim() + " (original POS of MWE: " + mwePos + ")");
        return null;
    }

    /**
     * Reads every tree from {@code treeFile}, fixes placeholder labels,
     * expands multi-word phrases, optionally normalizes the result, and writes
     * one tree per line to a sibling file named {@code treeFile + ".fixed"}.
     *
     * <p>Input and output both use UTF-8. The number of processed trees is
     * printed to standard output on success. On an I/O failure the stack trace
     * is printed and the method returns without printing the count; the reader
     * and writer are closed in either case.
     *
     * @param treeFile treebank file to read
     * @param unigramTagger word/tag counts passed to {@code traverseAndFix}
     * @param retainNER passed to {@code traverseAndFix}
     * @param tn normalizer passed to the expander and, when non-null, applied to each tree
     */
    private static void resolveDummyTags(File treeFile, TwoDimensionalCounter<String, String> unigramTagger, boolean retainNER, TreeNormalizer tn) {
        LabeledScoredTreeFactory tf = new LabeledScoredTreeFactory();
        MultiWordTreeExpander expander = new MultiWordTreeExpander();
        SpanishTreeReaderFactory trf = new SpanishTreeReaderFactory();
        int nTrees = 0;
        // try-with-resources closes the writer, tree reader and file reader (in
        // that order) even when reading or processing a tree throws.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
             TreeReader tr = trf.newTreeReader(br);
             PrintWriter pw = new PrintWriter(new PrintStream(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"))) {
            Tree t;
            while ((t = tr.readTree()) != null) {
                MultiWordPreprocessor.traverseAndFix(t, null, unigramTagger, retainNER);
                t = expander.expandPhrases(t, tn, tf);
                if (tn != null) {
                    t = tn.normalizeWholeTree(t, tf);
                }
                pw.println(t.toString());
                ++nTrees;
            }
        }
        catch (IOException e) {
            // Covers FileNotFoundException and UnsupportedEncodingException as well.
            e.printStackTrace();
            return;
        }
        System.out.println("Processed " + nTrees + " trees");
    }

    /**
     * Builds the command-line help text.
     *
     * @return the usage line followed by one line per supported option, each
     *         terminated by the platform line separator
     */
    private static String usage() {
        String nl = System.getProperty("line.separator");
        String header = String.format("Usage: java %s [OPTIONS] treebank-file%n", MultiWordPreprocessor.class.getName());
        return header
                + "Options:" + nl
                + "   -help: Print this message" + nl
                + "   -ner: Retain NER information in tree constituents (pre-pre-terminal nodes)" + nl
                + "   -normalize {true, false}: Run the Spanish tree normalizer (non-aggressive) on the output of the main routine (true by default)" + nl;
    }

    /**
     * Command-line entry point.
     *
     * <p>Makes a first pass over the treebank file to collect word/tag counts,
     * then calls {@code resolveDummyTags} to rewrite the file, and finally
     * prints summary statistics. Prints the usage text and returns when no
     * treebank file is given or {@code -help} is present. If the first pass
     * fails with an I/O error, the stack trace is printed and nothing else is done.
     *
     * @param args command-line arguments; see {@code usage()}
     */
    public static void main(String[] args) {
        Properties options = StringUtils.argsToProperties(args, argOptionDefs);
        if (!options.containsKey("") || options.containsKey("help")) {
            log.info(MultiWordPreprocessor.usage());
            return;
        }
        boolean retainNER = PropertiesUtils.getBool(options, "ner", false);
        boolean normalize = PropertiesUtils.getBool(options, "normalize", true);
        File treeFile = new File(options.getProperty(""));
        TwoDimensionalCounter<String, String> unigramTagger = new TwoDimensionalCounter<String, String>();
        SpanishTreeReaderFactory trf = new SpanishTreeReaderFactory();

        // First pass: collect counts. The readers are closed even if a tree fails to parse.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
             TreeReader tr = trf.newTreeReader(br)) {
            Tree t;
            while ((t = tr.readTree()) != null) {
                MultiWordPreprocessor.updateTagger(unigramTagger, t);
            }
        }
        catch (IOException e) {
            // Covers FileNotFoundException and UnsupportedEncodingException as well.
            e.printStackTrace();
            return;
        }

        System.out.println("Resolving DUMMY tags");
        MultiWordPreprocessor.resolveDummyTags(treeFile, unigramTagger, retainNER, normalize ? new SpanishTreeNormalizer(true, false, false) : null);
        System.out.println("#Unknown Word Types: " + ManualUWModel.nUnknownWordTypes);
        System.out.println(String.format("#Missing POS: %d (fixed: %d, %.2f%%)", nMissingPOS, nFixedPOS, MultiWordPreprocessor.fixedPercentage(nFixedPOS, nMissingPOS)));
        System.out.println(String.format("#Missing Phrasal: %d (fixed: %d, %.2f%%)", nMissingPhrasal, nFixedPhrasal, MultiWordPreprocessor.fixedPercentage(nFixedPhrasal, nMissingPhrasal)));
        System.out.println("Done!");
    }

    /** Returns {@code fixed / missing} as a percentage, or 0 when nothing was missing (avoids printing NaN). */
    private static double fixedPercentage(int fixed, int missing) {
        if (missing == 0) {
            return 0.0;
        }
        return (double)fixed / (double)missing * 100.0;
    }

    static {
        // Each row is {text after the last underscore of a node label, phrasal category}.
        String[][] categoryTable = {
            {"ao0000", "grup.a"},
            {"aq0000", "grup.a"},
            {"aqo000", "grup.a"},
            {"da0000", "spec"},
            {"di0000", "sn"},
            {"dn0000", "spec"},
            {"dt0000", "spec"},
            {"i", "interjeccio"},
            {"i00", "interjeccio"},
            {"rg", "grup.adv"},
            {"rn", "grup.adv"},
            {"vaip000", "grup.verb"},
            {"vmg0000", "grup.verb"},
            {"vmic000", "grup.verb"},
            {"vmii000", "grup.verb"},
            {"vmif000", "grup.verb"},
            {"vmip000", "grup.verb"},
            {"vmis000", "grup.verb"},
            {"vmm0000", "grup.verb"},
            {"vmn0000", "grup.verb"},
            {"vmp0000", "grup.verb"},
            {"vmsi000", "grup.verb"},
            {"vmsp000", "grup.verb"},
            {"zm", "grup.nom"},
            {"cc", "grup.cc"},
            {"cs", "grup.cs"},
            {"pn000000", "grup.nom"},
            {"pi000000", "grup.pron"},
            {"pr000000", "grup.pron"},
            {"pt000000", "grup.pron"},
            {"px000000", "grup.pron"},
            {"sp000", "grup.prep"},
            {"w", "grup.w"},
            {"z", "grup.z"},
            {"z0", "grup.z"},
            {"zp", "grup.z"},
            {"zu", "grup.z"},
        };
        phrasalCategoryMap = new HashMap<String, String>();
        for (String[] row : categoryTable) {
            phrasalCategoryMap.put(row[0], row[1]);
        }

        verbStripper = SpanishVerbStripper.getInstance();

        // Option name -> integer handed to StringUtils.argsToProperties in main.
        argOptionDefs = Generics.newHashMap();
        argOptionDefs.put("help", 0);
        argOptionDefs.put("ner", 0);
        argOptionDefs.put("normalize", 1);
    }

    /**
     * Comparator over POS tag strings that places tags starting with
     * {@code "n"} before all other tags and treats any two tags on the same
     * side of that split as equal. Passed to {@code Counters.argmax} by
     * {@code inferPOS}.
     */
    private static class POSTieBreaker
    implements Comparator<String> {
        private POSTieBreaker() {
        }

        /**
         * @return {@code -1} if only {@code o1} starts with {@code "n"},
         *         {@code 1} if only {@code o2} does, {@code 0} otherwise
         */
        @Override
        public int compare(String o1, String o2) {
            boolean leftNoun = o1.startsWith("n");
            boolean rightNoun = o2.startsWith("n");
            if (leftNoun == rightNoun) {
                return 0;
            }
            return leftNoun ? -1 : 1;
        }
    }

    /**
     * Hand-written rules for tagging words that the unigram tagger cannot
     * resolve. {@link #getOverrideTag} holds context-sensitive rules that are
     * consulted before any statistics; {@link #getTag} is the last-resort
     * fallback and always returns a tag.
     */
    private static class ManualUWModel {
        /** Exact word to tag lookup used by {@link #getTag}. */
        private static final Map<String, String> posMap = new HashMap<String, String>();
        /** Size of {@link #posMap} after initialization; reported by the enclosing class. */
        private static int nUnknownWordTypes;
        /** Finds a run of one or more digits anywhere in a word. */
        private static final Pattern digit = Pattern.compile("\\d+");
        /** Finds the word endings -ado, -ada, -ido, -ida. */
        private static final Pattern participle = Pattern.compile("[ai]d[oa]$");
        /** Words that {@link #getOverrideTag} always tags as proper nouns. */
        private static final Set<String> actuallyNames = new HashSet<String>(Arrays.asList("Avenida", "Contra", "Gracias", "in", "Mercado", "Jes\u00fas", "Salvo", "Van"));
        /** Capitalized-word pattern used to decide whether "Al" sits inside a name. */
        private static final Pattern otherNamePattern = Pattern.compile("\\b(Al\\w+|A[^l]\\w*|[B-Z]\\w+)");
        /** Capitalized-word pattern (two or more characters) used in the rule for "A". */
        private static final Pattern otherNamePattern2 = Pattern.compile("\\b(A\\w+|[B-Z]\\w+)");
        /** Matches the forms tod-, otr-, un- followed by o/a and an optional s. */
        private static final Pattern pPronounDeterminers = Pattern.compile("(tod|otr|un)[oa]s?");
        /**
         * Case-insensitive pattern checked by {@link #getTag} before the default tag.
         * NOTE(review): the alternatives contain spaces and look written for a
         * whole phrase, yet getTag applies this with matches() to a single
         * word -- confirm the intended input.
         */
        private static final Pattern commonPattern = Pattern.compile("^al? |^en .+ de$|sin | al?$| que$", Pattern.CASE_INSENSITIVE);

        private ManualUWModel() {
        }

        /**
         * Returns a tag for words whose correct tag depends on the phrase they
         * occur in, taking precedence over every other tagging source.
         *
         * @param word the word to tag
         * @param containingPhrase space-separated phrase containing the word; may be {@code null}
         * @return the override tag, or {@code null} when no rule applies or
         *         {@code containingPhrase} is {@code null}
         */
        public static String getOverrideTag(String word, String containingPhrase) {
            if (containingPhrase == null) {
                return null;
            }
            if (word.equalsIgnoreCase("este") && !containingPhrase.startsWith(word)) {
                return "np00000";
            }
            if (word.equals("contra") && (containingPhrase.startsWith("en contra") || containingPhrase.startsWith("En contra"))) {
                return "nc0s000";
            }
            if (word.equals("total") && containingPhrase.startsWith("ese")) {
                return "nc0s000";
            }
            if (word.equals("DEL")) {
                return "sp000";
            }
            // Both phrase checks must be guarded by the word check. Without the
            // parentheses, && binds tighter than || and every word of a phrase
            // containing the second expression would receive this tag.
            if (word.equals("s\u00ed") && (containingPhrase.contains("por s\u00ed") || containingPhrase.contains("fuera de s\u00ed"))) {
                return "pp000000";
            }
            if (pPronounDeterminers.matcher(word).matches() && containingPhrase.endsWith(word)) {
                return "pi000000";
            }
            if (word.equals("cuando") && containingPhrase.endsWith(word)) {
                return "pi000000";
            }
            if (word.equalsIgnoreCase("contra") && containingPhrase.endsWith(word)) {
                return "nc0s000";
            }
            if (word.equals("salvo") && containingPhrase.endsWith("salvo")) {
                return "aq0000";
            }
            if (word.equals("mira") && containingPhrase.endsWith(word)) {
                return "nc0s000";
            }
            if (word.equals("pro") && containingPhrase.startsWith("en pro")) {
                return "nc0s000";
            }
            if (word.equals("espera") && containingPhrase.endsWith("espera de")) {
                return "nc0s000";
            }
            if (word.equals("Paso") && containingPhrase.equals("El Paso")) {
                return "np00000";
            }
            if (word.equals("medio") && (containingPhrase.endsWith("medio de") || containingPhrase.endsWith("ambiente") || containingPhrase.endsWith("por medio") || containingPhrase.contains("por medio") || containingPhrase.endsWith("medio"))) {
                return "nc0s000";
            }
            if (word.equals("Medio") && containingPhrase.contains("Ambiente")) {
                return "nc0s000";
            }
            if (word.equals("Medio") && containingPhrase.equals("Oriente Medio")) {
                return "aq0000";
            }
            if (word.equals("media") && containingPhrase.equals("mass media")) {
                return "nc0n000";
            }
            if (word.equals("cuenta")) {
                return "nc0s000";
            }
            if (word.equals("h") && containingPhrase.startsWith("km")) {
                return "zu";
            }
            if (word.equals("A") && (containingPhrase.contains("-") || containingPhrase.contains(",") || otherNamePattern2.matcher(containingPhrase).find() || containingPhrase.equals("terminal A"))) {
                return "np00000";
            }
            if (word.equals("forma") && containingPhrase.startsWith("forma parte")) {
                return "vmip000";
            }
            if (word.equals("Sin") && containingPhrase.contains("Jaime")) {
                return "np00000";
            }
            if (word.equals("di") && containingPhrase.contains("di cuenta")) {
                return "vmis000";
            }
            if (word.equals("demos") && containingPhrase.contains("demos cuenta")) {
                return "vmsp000";
            }
            if ((word.equals("van") || word.equals("den")) && containingPhrase.contains("van den")) {
                return "np00000";
            }
            if (word.equals("Al")) {
                // "Al" next to another capitalized word is tagged as a name part.
                if (otherNamePattern.matcher(containingPhrase).find()) {
                    return "np00000";
                }
                return "sp000";
            }
            if (actuallyNames.contains(word)) {
                return "np00000";
            }
            if (word.equals("sino") && containingPhrase.endsWith(word)) {
                return "nc0s000";
            }
            if (word.equals("ma\u00f1ana") || word.equals("paso") || word.equals("monta") || word.equals("deriva") || word.equals("visto")) {
                return "nc0s000";
            }
            if (word.equals("frente") && containingPhrase.startsWith("al frente")) {
                return "nc0s000";
            }
            return null;
        }

        /**
         * Returns a best-guess tag for a word; never returns {@code null}.
         *
         * <p>Checks, in order: a few punctuation symbols, words containing a
         * digit, the exact-match table, the participle-like ending, the
         * common-noun pattern, and finally defaults to the proper-noun tag.
         *
         * @param word the word to tag
         * @param containingPhrase phrase containing the word (not read by this method)
         * @return the guessed tag
         */
        public static String getTag(String word, String containingPhrase) {
            if (word.equals("%")) {
                return "ft";
            }
            if (word.equals("+")) {
                return "fz";
            }
            if (word.equals("&") || word.equals("@")) {
                return "f0";
            }
            if (digit.matcher(word).find()) {
                return "z0";
            }
            if (posMap.containsKey(word)) {
                return posMap.get(word);
            }
            if (participle.matcher(word).find()) {
                return "aq0000";
            }
            if (commonPattern.matcher(word).matches()) {
                return "ncms000";
            }
            return "np00000";
        }

        static {
            posMap.put("c\u00fabico", "aq0000");
            posMap.put("c\u00fabicos", "aq0000");
            posMap.put("diagonal", "aq0000");
            posMap.put("diestro", "aq0000");
            posMap.put("llevados", "aq0000");
            posMap.put("llevadas", "aq0000");
            posMap.put("menudo", "aq0000");
            posMap.put("obstante", "aq0000");
            posMap.put("rapadas", "aq0000");
            posMap.put("rasa", "aq0000");
            posMap.put("s\u00fabito", "aq0000");
            posMap.put("tem\u00e1tica", "aq0000");
            posMap.put("tuya", "px000000");
            posMap.put("alter", "nc0s000");
            posMap.put("ego", "nc0s000");
            posMap.put("Jet", "nc0s000");
            posMap.put("lag", "nc0s000");
            posMap.put("line", "nc0s000");
            posMap.put("lord", "nc0s000");
            posMap.put("model", "nc0s000");
            posMap.put("mortem", "nc0s000");
            posMap.put("pater", "nc0s000");
            posMap.put("pipe", "nc0s000");
            posMap.put("play", "nc0s000");
            posMap.put("pollastre", "nc0s000");
            posMap.put("post", "nc0s000");
            posMap.put("power", "nc0s000");
            posMap.put("priori", "nc0s000");
            posMap.put("rock", "nc0s000");
            posMap.put("roll", "nc0s000");
            posMap.put("salubritatis", "nc0s000");
            posMap.put("savoir", "nc0s000");
            posMap.put("service", "nc0s000");
            posMap.put("status", "nc0s000");
            posMap.put("stem", "nc0s000");
            posMap.put("street", "nc0s000");
            posMap.put("task", "nc0s000");
            posMap.put("trio", "nc0s000");
            posMap.put("zigzag", "nc0s000");
            posMap.put("mass", "nc0n000");
            posMap.put("media", "nc0n000");
            posMap.put("options", "nc0p000");
            posMap.put("rega\u00f1adientes", "nc0n000");
            posMap.put("sabiendas", "nc0n000");
            posMap.put("virgen", "nc0s000");
            posMap.put("merced", "ncfs000");
            posMap.put("miel", "ncfs000");
            posMap.put("torera", "ncfs000");
            posMap.put("ultranza", "ncfs000");
            posMap.put("v\u00edsperas", "ncfs000");
            posMap.put("acecho", "ncms000");
            posMap.put("alzamiento", "ncms000");
            posMap.put("bordo", "ncms000");
            posMap.put("c\u00e1pita", "ncms000");
            posMap.put("ciento", "ncms000");
            posMap.put("cu\u00f1o", "ncms000");
            posMap.put("pairo", "ncms000");
            posMap.put("pese", "ncms000");
            posMap.put("pique", "ncms000");
            posMap.put("pos", "ncms000");
            posMap.put("postre", "ncms000");
            posMap.put("pro", "ncms000");
            posMap.put("ralent\u00ed", "ncms000");
            posMap.put("ras", "ncms000");
            posMap.put("rebato", "ncms000");
            posMap.put("torno", "ncms000");
            posMap.put("trav\u00e9s", "ncms000");
            posMap.put("creces", "ncfp000");
            posMap.put("cuestas", "ncfp000");
            posMap.put("o\u00eddas", "ncfp000");
            posMap.put("tientas", "ncfp000");
            posMap.put("trizas", "ncfp000");
            posMap.put("veras", "ncfp000");
            posMap.put("abuelos", "ncmp000");
            posMap.put("ambages", "ncmp000");
            posMap.put("modos", "ncmp000");
            posMap.put("pedazos", "ncmp000");
            posMap.put("A", "sps00");
            posMap.put("am\u00e9n", "rg");
            posMap.put("Bailando", "vmg0000");
            posMap.put("So\u00f1ando", "vmg0000");
            posMap.put("Teniendo", "vmg0000");
            posMap.put("echaremos", "vmif000");
            posMap.put("formaba", "vmii000");
            posMap.put("Formabas", "vmii000");
            posMap.put("Forman", "vmip000");
            posMap.put("perece", "vmip000");
            posMap.put("PONE", "vmip000");
            posMap.put("suic\u00eddate", "vmm0000");
            posMap.put("tardar", "vmn0000");
            posMap.put("seiscientas", "z0");
            posMap.put("trescientas", "z0");
            posMap.put("cc", "zu");
            posMap.put("km", "zu");
            posMap.put("kms", "zu");
            // Must run after the table is fully populated.
            nUnknownWordTypes = posMap.size();
        }
    }
}

