package cc.mallet.util;

import cc.mallet.pipe.CharSequenceLowercase;
import cc.mallet.pipe.FeatureCountPipe;
import cc.mallet.pipe.FixedVocabTokenizer;
import cc.mallet.pipe.NGramPreprocessor;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.SimpleTokenizer;
import cc.mallet.pipe.StringList2FeatureSequence;
import cc.mallet.pipe.iterator.CsvIterator;
import cc.mallet.types.Alphabet;
import cc.mallet.types.AlphabetFactory;
import cc.mallet.types.Instance;
import cc.mallet.util.CommandOption;
import edu.stanford.nlp.ling.tokensregex.types.Expressions;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.logging.Logger;

/* loaded from: input_file:cc/mallet/util/DBBulkLoader.class */
/**
 * Command-line tool that reads one or more text files, runs every line through a
 * pipe sequence, and saves the resulting instances and alphabets to a
 * {@link DBInstanceStore}.
 *
 * <p>Each input line is matched against {@link #LINE_REGEX} (three tab-separated
 * fields). Two pipelines are possible:
 * <ul>
 *   <li>when {@code --vocabulary} is given: n-gram preprocessing followed by a
 *       {@link FixedVocabTokenizer} over the loaded, growth-stopped alphabet;</li>
 *   <li>otherwise: n-gram preprocessing, a {@link SimpleTokenizer} built from the
 *       {@code --stoplist} file, and a {@link StringList2FeatureSequence}. When
 *       {@code --prune-count} is positive, an extra counting pass over the input
 *       runs first (see {@link #generateStoplist}).</li>
 * </ul>
 *
 * <p>The {@code --preserve-case} and {@code --keep-sequence} options are declared
 * but are not read anywhere in this class.
 */
public class DBBulkLoader {
    protected static Logger logger = MalletLogger.getLogger(DBBulkLoader.class.getName());

    /**
     * Line pattern handed to {@link CsvIterator}: three tab-separated fields. It is always
     * passed together with the group indices 3, 2, 1 (presumably data, target and name, in
     * that order -- confirm against CsvIterator).
     */
    private static final String LINE_REGEX = "(.*?)\\t(.*?)\\t(.*)";

    /** Number of instances between progress lines printed during the counting pass. */
    private static final int PROGRESS_INTERVAL = 100000;

    static CommandOption.SpacedStrings inputFiles = new CommandOption.SpacedStrings(DBBulkLoader.class, "input", "FILE [FILE ...]", true, null, "The file containing data, one instance per line", null);
    // NOTE(review): Expressions.TYPE_STRING comes from an unrelated library and is only used as
    // the argument-name text of this option; it looks like a decompiler substitution for a
    // plain string literal -- confirm its value before replacing it.
    static CommandOption.String outputDatabase = new CommandOption.String(DBBulkLoader.class, "output", Expressions.TYPE_STRING, true, "mallet-db", "Write the instance list to this database", null);
    static CommandOption.Boolean preserveCase = new CommandOption.Boolean(DBBulkLoader.class, "preserve-case", "[TRUE|FALSE]", false, false, "If true, do not force all strings to lowercase.", null);
    static CommandOption.File vocabularyFile = new CommandOption.File(DBBulkLoader.class, "vocabulary", "FILE", true, null, "Read newline-separated words from this file.", null);
    static CommandOption.SpacedStrings replacementFiles = new CommandOption.SpacedStrings(DBBulkLoader.class, "replacement-files", "FILE [FILE ...]", true, null, "files containing string replacements, one per line:\n\t 'A B [tab] C' replaces A B with C,\n\t 'A B' replaces A B with A_B", null);
    static CommandOption.SpacedStrings deletionFiles = new CommandOption.SpacedStrings(DBBulkLoader.class, "deletion-files", "FILE [FILE ...]", true, null, "files containing strings to delete after replacements but before tokenization (ie multiword stop terms)", null);
    static CommandOption.File stoplistFile = new CommandOption.File(DBBulkLoader.class, "stoplist", "FILE", true, null, "Read newline-separated words from this file and remove them from text.", null);
    static CommandOption.Boolean keepSequence = new CommandOption.Boolean(DBBulkLoader.class, "keep-sequence", "[TRUE|FALSE]", false, true, "If true, final data will be a FeatureSequence rather than a FeatureVector.", null);
    static CommandOption.Integer pruneCount = new CommandOption.Integer(DBBulkLoader.class, "prune-count", "N", false, 0, "Reduce features to those that occur more than N times.", null);

    /** Returns the configured input files, failing with a clear message when none were given. */
    private static String[] requireInputFiles() {
        String[] files = inputFiles.value;
        if (files == null) {
            throw new IllegalArgumentException("No input files specified; use --input FILE [FILE ...]");
        }
        return files;
    }

    /**
     * Runs a counting pass over every input file and then asks the counting pipe to add
     * pruned words to {@code simpleTokenizer}, using the {@code --prune-count} value.
     *
     * <p>The pass uses a deep clone of the tokenizer and a private alphabet, so the only
     * object of the caller's that is modified here is {@code simpleTokenizer}, in the final
     * {@code addPrunedWordsToStoplist} call.
     *
     * @param simpleTokenizer   tokenizer that receives the pruned words
     * @param nGramPreprocessor preprocessor placed first in the counting pipeline
     * @throws IOException              if an input file cannot be opened or closed
     * @throws IllegalArgumentException if no input files were configured
     */
    public static void generateStoplist(SimpleTokenizer simpleTokenizer, NGramPreprocessor nGramPreprocessor) throws IOException {
        String[] files = requireInputFiles();

        Alphabet countingAlphabet = new Alphabet();
        FeatureCountPipe counter = new FeatureCountPipe(countingAlphabet, null);

        ArrayList<Pipe> pipes = new ArrayList<>();
        pipes.add(nGramPreprocessor);
        pipes.add(simpleTokenizer.deepClone());
        pipes.add(new StringList2FeatureSequence(countingAlphabet));
        pipes.add(counter);
        SerialPipes countingPipeline = new SerialPipes(pipes);

        for (String file : files) {
            logger.info("pruning from " + file);
            // The iterator is drained inside the try block, so the reader can be closed
            // as soon as the loop for this file ends.
            try (FileReader reader = new FileReader(file)) {
                Iterator<Instance> instances = countingPipeline.newIteratorFrom(new CsvIterator(reader, LINE_REGEX, 3, 2, 1));
                int seen = 0;
                while (instances.hasNext()) {
                    seen++;
                    if (seen % PROGRESS_INTERVAL == 0) {
                        System.out.println(seen);
                    }
                    // The instance itself is discarded; pulling it through the pipeline
                    // is what feeds the counting pipe.
                    instances.next();
                }
            }
        }
        counter.addPrunedWordsToStoplist(simpleTokenizer, pruneCount.value);
    }

    /**
     * Pipes every input file through {@code arrayList} and saves the instances, followed by
     * the pipeline's data and target alphabets, to the database named by {@code --output}.
     *
     * <p>The store's {@code cleanup()} is invoked whether or not saving succeeds. If saving
     * fails and cleanup fails as well, the cleanup failure is attached to the original
     * exception as a suppressed exception.
     *
     * @param arrayList pipes to apply, in order, to every input line
     * @throws IllegalArgumentException if no input files were configured
     * @throws Exception                if reading the input or writing to the store fails
     */
    public static void writeInstanceList(ArrayList<Pipe> arrayList) throws Exception {
        // Validate before opening the store so a missing --input does not leave it open.
        String[] files = requireInputFiles();

        SerialPipes pipeline = new SerialPipes(arrayList);
        DBInstanceStore store = new DBInstanceStore(outputDatabase.value);
        try {
            for (String file : files) {
                logger.info("importing from " + file);
                // Assumes saveInstances consumes the iterator before returning, so the
                // reader is no longer needed once the call completes.
                try (FileReader reader = new FileReader(file)) {
                    store.saveInstances(pipeline.newIteratorFrom(new CsvIterator(reader, LINE_REGEX, 3, 2, 1)));
                }
            }
            store.saveAlphabets(pipeline.getDataAlphabet(), pipeline.getTargetAlphabet());
        } catch (Exception failure) {
            try {
                store.cleanup();
            } catch (Exception cleanupFailure) {
                // Keep the original failure as the primary exception.
                failure.addSuppressed(cleanupFailure);
            }
            throw failure;
        }
        store.cleanup();
    }

    /**
     * Entry point: parses the command-line options, loads any replacement and deletion
     * files into the n-gram preprocessor, builds the pipeline and writes the instances.
     *
     * @param strArr command-line arguments
     * @throws IllegalArgumentException if no input files were given
     * @throws Exception                if option handling, reading or writing fails
     */
    public static void main(String[] strArr) throws Exception {
        logger.info("starting");
        CommandOption.setSummary(DBBulkLoader.class, "Efficient tool for importing large amounts of text and saving to an embedded Java database");
        CommandOption.process(DBBulkLoader.class, strArr);
        // Fail fast, before any replacement/deletion/vocabulary files are loaded.
        requireInputFiles();

        NGramPreprocessor preprocessor = new NGramPreprocessor();
        if (replacementFiles.value != null) {
            for (String file : replacementFiles.value) {
                System.out.println("including replacements from " + file);
                preprocessor.loadReplacements(file);
            }
        }
        if (deletionFiles.value != null) {
            for (String file : deletionFiles.value) {
                System.out.println("including deletions from " + file);
                preprocessor.loadDeletions(file);
            }
        }

        if (vocabularyFile.value != null) {
            // Fixed-vocabulary mode: the stoplist and prune-count options are not consulted.
            Alphabet vocabulary = AlphabetFactory.loadFromFile(vocabularyFile.value);
            vocabulary.stopGrowth();
            logger.info("loaded alphabet of size " + vocabulary.size());

            ArrayList<Pipe> fixedVocabPipes = new ArrayList<>();
            fixedVocabPipes.add(preprocessor);
            fixedVocabPipes.add(new FixedVocabTokenizer(vocabulary));
            writeInstanceList(fixedVocabPipes);
            return;
        }

        SimpleTokenizer tokenizer = new SimpleTokenizer(stoplistFile.value);
        if (pruneCount.value > 0) {
            generateStoplist(tokenizer, preprocessor);
        }

        ArrayList<Pipe> pipes = new ArrayList<>();
        pipes.add(preprocessor);
        pipes.add(tokenizer);
        pipes.add(new StringList2FeatureSequence(new Alphabet()));
        writeInstanceList(pipes);
    }
}
