/*
 * Decompiled with CFR 0.152.
 */
package org.carrot2.text.vsm;

import com.carrotsearch.hppc.BitSet;
import com.carrotsearch.hppc.IntIntHashMap;
import com.carrotsearch.hppc.sorting.IndirectComparator;
import com.carrotsearch.hppc.sorting.IndirectSort;
import org.carrot2.core.attribute.Internal;
import org.carrot2.core.attribute.Processing;
import org.carrot2.mahout.math.matrix.DoubleMatrix2D;
import org.carrot2.mahout.math.matrix.impl.DenseDoubleMatrix2D;
import org.carrot2.mahout.math.matrix.impl.SparseDoubleMatrix2D;
import org.carrot2.matrix.MatrixUtils;
import org.carrot2.text.analysis.TokenTypeUtils;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.text.vsm.ITermWeighting;
import org.carrot2.text.vsm.LinearTfIdfTermWeighting;
import org.carrot2.text.vsm.LogTfIdfTermWeighting;
import org.carrot2.text.vsm.TfTermWeighting;
import org.carrot2.text.vsm.VectorSpaceModelContext;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.Required;
import org.carrot2.util.attribute.constraint.DoubleRange;
import org.carrot2.util.attribute.constraint.ImplementingClasses;
import org.carrot2.util.attribute.constraint.IntRange;

/**
 * Builds term-document and term-phrase matrices from the data held in a
 * {@link VectorSpaceModelContext} and stores the results back into that context.
 *
 * <p>The public fields are bindable attributes; their constraints are declared by the
 * annotations on each field.
 */
@Bindable(prefix="TermDocumentMatrixBuilder")
public class TermDocumentMatrixBuilder {
    /** Name of the attribute group used by the matrix-related attributes of this class. */
    public static final String MATRIX_MODEL = "Matrix model";

    /**
     * Multiplier applied to the weight of stems whose field mask contains the field
     * named {@code "title"}. All other stems are multiplied by {@code 1.0}.
     */
    @Input
    @Processing
    @Attribute
    @DoubleRange(min=0.0, max=10.0)
    @Level(AttributeLevel.MEDIUM)
    @Group("Labels")
    public double titleWordsBoost = 2.0;

    /**
     * Upper bound on the number of cells of the term-document matrix. The number of
     * rows is capped at {@code maximumMatrixSize / documentCount} (integer division).
     */
    @Input
    @Processing
    @Attribute
    @IntRange(min=5000)
    @Internal(configuration=true)
    @Level(AttributeLevel.ADVANCED)
    @Group(MATRIX_MODEL)
    public int maximumMatrixSize = 37500;

    /**
     * Maximum document frequency of a stem, expressed as a fraction of the document
     * count. Stems occurring in a larger fraction of documents are left out.
     */
    @Input
    @Processing
    @Attribute
    @DoubleRange(min=0.0, max=1.0)
    @Level(AttributeLevel.ADVANCED)
    @Group(MATRIX_MODEL)
    public double maxWordDf = 0.9;

    /** Strategy used to turn term and document frequencies into matrix weights. */
    @Input
    @Processing
    @Attribute
    @Required
    @ImplementingClasses(classes={LogTfIdfTermWeighting.class, LinearTfIdfTermWeighting.class, TfTermWeighting.class}, strict=false)
    @Level(AttributeLevel.ADVANCED)
    @Group(MATRIX_MODEL)
    public ITermWeighting termWeighting = new LogTfIdfTermWeighting();

    /**
     * Builds the term-document matrix and stores it, together with the stem-to-row
     * mapping, in {@code vsmContext.termDocumentMatrix} and
     * {@code vsmContext.stemToRowIndex}.
     *
     * <p>When there are no documents, an empty 0x0 matrix and an empty map are stored.
     * Otherwise the stems selected by {@link #computeRequiredStemIndices} are ordered
     * by descending weight and at most {@code maximumMatrixSize / documentCount} of
     * them become matrix rows.
     *
     * @param vsmContext context providing the preprocessing data and receiving the results
     */
    public void buildTermDocumentMatrix(VectorSpaceModelContext vsmContext) {
        PreprocessingContext preprocessingContext = vsmContext.preprocessingContext;
        int documentCount = preprocessingContext.documents.size();
        int[] stemsTf = preprocessingContext.allStems.tf;
        int[][] stemsTfByDocument = preprocessingContext.allStems.tfByDocument;
        byte[] stemsFieldIndices = preprocessingContext.allStems.fieldIndices;

        if (documentCount == 0) {
            vsmContext.termDocumentMatrix = new DenseDoubleMatrix2D(0, 0);
            vsmContext.stemToRowIndex = new IntIntHashMap();
            return;
        }

        int titleFieldIndex = TermDocumentMatrixBuilder.findTitleFieldIndex(preprocessingContext.allFields.name);

        // Global weight of every candidate stem; used only to rank the candidates.
        int[] stemsToInclude = this.computeRequiredStemIndices(preprocessingContext);
        double[] stemsWeight = new double[stemsToInclude.length];
        for (int i = 0; i < stemsToInclude.length; ++i) {
            int stemIndex = stemsToInclude[i];
            // The array holds two ints per document (see the fill loop below),
            // so half its length is the document frequency.
            int df = stemsTfByDocument[stemIndex].length / 2;
            stemsWeight[i] = this.termWeighting.calculateTermWeight(stemsTf[stemIndex], df, documentCount)
                * this.getWeightBoost(titleFieldIndex, stemsFieldIndices[stemIndex]);
        }

        IndirectComparator byDescendingWeight = new IndirectComparator.DescendingDoubleComparator(stemsWeight);
        int[] stemWeightOrder = IndirectSort.mergesort(0, stemsWeight.length, byDescendingWeight);

        // Keep the matrix within maximumMatrixSize cells by limiting the row count.
        int maxRows = this.maximumMatrixSize / documentCount;
        int rowCount = Math.min(maxRows, stemsToInclude.length);
        DenseDoubleMatrix2D tdMatrix = new DenseDoubleMatrix2D(rowCount, documentCount);
        IntIntHashMap stemToRowIndex = new IntIntHashMap();

        for (int row = 0; row < rowCount; ++row) {
            int stemIndex = stemsToInclude[stemWeightOrder[row]];
            int[] tfByDocument = stemsTfByDocument[stemIndex];
            int df = tfByDocument.length / 2;
            // The boost depends on the stem only, so compute it once per row.
            double boost = this.getWeightBoost(titleFieldIndex, stemsFieldIndices[stemIndex]);

            // Even slots hold the document (column) index, odd slots the frequency
            // of the stem in that document.
            for (int j = 0; j < df; ++j) {
                int documentIndex = tfByDocument[j * 2];
                int tfInDocument = tfByDocument[j * 2 + 1];
                double weight = this.termWeighting.calculateTermWeight(tfInDocument, df, documentCount);
                tdMatrix.set(row, documentIndex, weight * boost);
            }
            stemToRowIndex.put(stemIndex, row);
        }

        vsmContext.termDocumentMatrix = tdMatrix;
        vsmContext.stemToRowIndex = stemToRowIndex;
    }

    /**
     * Builds the term-phrase matrix for all labels starting at
     * {@code allLabels.firstPhraseIndex} and stores its transposed view in
     * {@code context.termPhraseMatrix}.
     *
     * <p>Nothing is stored when {@code firstPhraseIndex} is negative or the
     * stem-to-row mapping is empty; in that case {@code context.termPhraseMatrix}
     * is left as it was.
     *
     * @param context context whose {@code stemToRowIndex} has already been populated
     */
    public void buildTermPhraseMatrix(VectorSpaceModelContext context) {
        PreprocessingContext preprocessingContext = context.preprocessingContext;
        IntIntHashMap stemToRowIndex = context.stemToRowIndex;
        int[] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex;
        int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex;

        if (firstPhraseIndex < 0 || stemToRowIndex.size() <= 0) {
            return;
        }

        // Copy the tail of the label array, i.e. everything from the first phrase on.
        int[] phraseFeatureIndices = new int[labelsFeatureIndex.length - firstPhraseIndex];
        System.arraycopy(labelsFeatureIndex, firstPhraseIndex, phraseFeatureIndices, 0, phraseFeatureIndices.length);

        DoubleMatrix2D phraseMatrix = TermDocumentMatrixBuilder.buildAlignedMatrix(context, phraseFeatureIndices, this.termWeighting);
        MatrixUtils.normalizeColumnL2(phraseMatrix, null);
        context.termPhraseMatrix = phraseMatrix.viewDice();
    }

    /**
     * Returns the index of the first field named {@code "title"}.
     *
     * @param fieldNames names of all fields
     * @return index of the title field or {@code -1} when there is none
     */
    private static int findTitleFieldIndex(String[] fieldNames) {
        for (int i = 0; i < fieldNames.length; ++i) {
            if ("title".equals(fieldNames[i])) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Returns the weight multiplier for a stem with the given field bit mask.
     *
     * @param titleFieldIndex index of the title field, or {@code -1} when there is none
     * @param fieldIndices bit mask in which bit {@code n} is tested for field {@code n}
     * @return {@link #titleWordsBoost} when the title bit is set, {@code 1.0} otherwise
     */
    private double getWeightBoost(int titleFieldIndex, byte fieldIndices) {
        // Only bits 0..7 exist in a byte mask. Without this guard a missing title
        // (-1) would shift by 31 (shift distances are masked to 5 bits) and match the
        // sign-extension bits of any negative mask, boosting unrelated stems.
        if (titleFieldIndex < 0 || titleFieldIndex >= Byte.SIZE) {
            return 1.0;
        }
        if ((fieldIndices & (1 << titleFieldIndex)) != 0) {
            return this.titleWordsBoost;
        }
        return 1.0;
    }

    /**
     * Collects the indices of all stems referenced by the labels, either directly
     * (single-word labels) or through the non-common words of phrase labels, whose
     * document frequency does not exceed {@link #maxWordDf}.
     *
     * @param context preprocessing data to read labels, words, phrases and stems from
     * @return indices of the selected stems
     */
    private int[] computeRequiredStemIndices(PreprocessingContext context) {
        int[] labelsFeatureIndex = context.allLabels.featureIndex;
        int[] wordsStemIndex = context.allWords.stemIndex;
        short[] wordsTypes = context.allWords.type;
        int[][] phrasesWordIndices = context.allPhrases.wordIndices;
        int wordCount = wordsStemIndex.length;
        int[][] stemsTfByDocument = context.allStems.tfByDocument;
        int documentCount = context.documents.size();

        BitSet requiredStemIndices = new BitSet(labelsFeatureIndex.length);
        for (int i = 0; i < labelsFeatureIndex.length; ++i) {
            int featureIndex = labelsFeatureIndex[i];
            if (featureIndex < wordCount) {
                // Feature indices below the word count refer to single words.
                this.addStemIndex(wordsStemIndex, documentCount, stemsTfByDocument, requiredStemIndices, featureIndex);
                continue;
            }

            // Remaining feature indices refer to phrases, offset by the word count.
            int[] wordIndices = phrasesWordIndices[featureIndex - wordCount];
            for (int j = 0; j < wordIndices.length; ++j) {
                int wordIndex = wordIndices[j];
                if (!TokenTypeUtils.isCommon(wordsTypes[wordIndex])) {
                    this.addStemIndex(wordsStemIndex, documentCount, stemsTfByDocument, requiredStemIndices, wordIndex);
                }
            }
        }
        return requiredStemIndices.asIntLookupContainer().toArray();
    }

    /**
     * Marks the stem of the given word as required, provided the fraction of documents
     * containing the stem does not exceed {@link #maxWordDf}.
     *
     * @param wordsStemIndex stem index of every word
     * @param documentCount total number of documents
     * @param stemsTfByDocument per-stem arrays holding two ints per document
     * @param requiredStemIndices set receiving the accepted stem index
     * @param featureIndex index of the word whose stem is considered
     */
    private void addStemIndex(int[] wordsStemIndex, int documentCount, int[][] stemsTfByDocument, BitSet requiredStemIndices, int featureIndex) {
        int stemIndex = wordsStemIndex[featureIndex];
        int df = stemsTfByDocument[stemIndex].length / 2;
        if ((double)df / (double)documentCount <= this.maxWordDf) {
            requiredStemIndices.set(stemIndex);
        }
    }

    /**
     * Builds a matrix whose rows follow {@code vsmContext.stemToRowIndex} and whose
     * columns correspond, in order, to the given features. Each cell holds the weight
     * of a stem occurring in the feature; stems without a row are skipped.
     *
     * @param vsmContext context providing the stem-to-row mapping and preprocessing data
     * @param featureIndex indices of the features (words or phrases) to create columns for
     * @param termWeighting strategy used to compute the stem weights
     * @return an empty dense matrix when {@code featureIndex} is empty, otherwise a sparse matrix
     */
    static DoubleMatrix2D buildAlignedMatrix(VectorSpaceModelContext vsmContext, int[] featureIndex, ITermWeighting termWeighting) {
        IntIntHashMap stemToRowIndex = vsmContext.stemToRowIndex;
        if (featureIndex.length == 0) {
            return new DenseDoubleMatrix2D(stemToRowIndex.size(), 0);
        }

        DoubleMatrix2D phraseMatrix = new SparseDoubleMatrix2D(stemToRowIndex.size(), featureIndex.length);
        PreprocessingContext preprocessingContext = vsmContext.preprocessingContext;
        int[] wordsStemIndex = preprocessingContext.allWords.stemIndex;
        int[] stemsTf = preprocessingContext.allStems.tf;
        int[][] stemsTfByDocument = preprocessingContext.allStems.tfByDocument;
        int[][] phrasesWordIndices = preprocessingContext.allPhrases.wordIndices;
        int documentCount = preprocessingContext.documents.size();
        int wordCount = wordsStemIndex.length;

        for (int column = 0; column < featureIndex.length; ++column) {
            int feature = featureIndex[column];
            // A single word is treated as a one-word phrase.
            int[] wordIndices = feature < wordCount ? new int[]{feature} : phrasesWordIndices[feature - wordCount];

            for (int w = 0; w < wordIndices.length; ++w) {
                int stemIndex = wordsStemIndex[wordIndices[w]];
                int slot = stemToRowIndex.indexOf(stemIndex);
                if (!stemToRowIndex.indexExists(slot)) {
                    // The stem has no row in the term-document matrix.
                    continue;
                }
                int rowIndex = stemToRowIndex.indexGet(slot);
                double weight = termWeighting.calculateTermWeight(stemsTf[stemIndex], stemsTfByDocument[stemIndex].length / 2, documentCount);
                phraseMatrix.setQuick(rowIndex, column, weight);
            }
        }
        return phraseMatrix;
    }
}

