/*
 * Decompiled with CFR 0.152.
 */
package org.apache.lucene.analysis.wikipedia;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Collections;
import java.util.HashSet;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;

/**
 * Tests for {@link WikipediaTokenizer}: checks the terms, token types, offsets,
 * position increments and flags it emits for hand-written wiki markup, and runs
 * it against randomly generated input.
 */
public class WikipediaTokenizerTest
extends BaseTokenStreamTestCase {
    /** Markup mixing an internal link, an external link and a category; shared by the link-phrase tests. */
    protected static final String LINK_PHRASES = "click [[link here again]] click [http://lucene.apache.org here again] [[Category:a b c d]]";

    /**
     * Tokenizes a short sentence ending in a category link and checks terms,
     * start/end offsets, types, position increments and the final offset.
     */
    public void testSimple() throws Exception {
        String text = "This is a [[Category:foo]]";
        WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
        tf.setReader(new StringReader(text));
        assertTokenStreamContents(tf,
            new String[]{"This", "is", "a", "foo"},
            new int[]{0, 5, 8, 21},
            new int[]{4, 7, 9, 24},
            new String[]{"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "c"},
            new int[]{1, 1, 1, 1},
            Integer.valueOf(text.length()));
    }

    /**
     * Tokenizes a long hand-written sample covering links, categories, italics,
     * bold, headings, external URLs, citations and plain HTML tags, and checks
     * every emitted term together with its token type.
     */
    public void testHandwritten() throws Exception {
        String test = "[[link]] This is a [[Category:foo]] Category  This is a linked [[:Category:bar none withstanding]] Category This is (parens) This is a [[link]]  This is an external URL [http://lucene.apache.org] Here is ''italics'' and ''more italics'', '''bold''' and '''''five quotes'''''  This is a [[link|display info]]  This is a period.  Here is $3.25 and here is 3.50.  Here's Johnny.  ==heading== ===sub head=== followed by some text  [[Category:blah| ]] ''[[Category:ital_cat]]''  here is some that is ''italics [[Category:foo]] but is never closed.'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test] [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";
        WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
        tf.setReader(new StringReader(test));
        // The two arrays below are laid out in matching rows (ten entries per row,
        // eight in the last) so that each term lines up with its expected type.
        assertTokenStreamContents(tf,
            new String[]{
                "link", "This", "is", "a", "foo", "Category", "This", "is", "a", "linked",
                "bar", "none", "withstanding", "Category", "This", "is", "parens", "This", "is", "a",
                "link", "This", "is", "an", "external", "URL", "http://lucene.apache.org", "Here", "is", "italics",
                "and", "more", "italics", "bold", "and", "five", "quotes", "This", "is", "a",
                "link", "display", "info", "This", "is", "a", "period", "Here", "is", "3.25",
                "and", "here", "is", "3.50", "Here's", "Johnny", "heading", "sub", "head", "followed",
                "by", "some", "text", "blah", "ital", "cat", "here", "is", "some", "that",
                "is", "italics", "foo", "but", "is", "never", "closed", "same", "foo", "goes",
                "for", "this", "and2", "foo", "and", "this", "http://foo.boo.com/test/test/", "Test", "Test", "http://foo.boo.com/test/test/test.html",
                "Test", "Test", "http://foo.boo.com/test/test/test.html?g=b&c=d", "Test", "Test", "Citation", "martian", "code"},
            new String[]{
                "il", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "c", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
                "c", "c", "c", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
                "il", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "elu", "<ALPHANUM>", "<ALPHANUM>", "i",
                "<ALPHANUM>", "i", "i", "b", "<ALPHANUM>", "bi", "bi", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
                "il", "il", "il", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>",
                "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<APOSTROPHE>", "<ALPHANUM>", "h", "sh", "sh", "<ALPHANUM>",
                "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "c", "c", "c", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
                "<ALPHANUM>", "i", "c", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "b", "c", "<ALPHANUM>",
                "<ALPHANUM>", "<ALPHANUM>", "bi", "c", "<ALPHANUM>", "<ALPHANUM>", "elu", "el", "el", "elu",
                "el", "el", "elu", "el", "el", "ci", "<ALPHANUM>", "<ALPHANUM>"});
    }

    /** Checks terms and position increments for {@link #LINK_PHRASES} with no untokenized types. */
    public void testLinkPhrases() throws Exception {
        WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
        tf.setReader(new StringReader(LINK_PHRASES));
        checkLinkPhrases(tf);
    }

    /**
     * Asserts the terms and position increments expected for {@link #LINK_PHRASES}.
     *
     * @param tf tokenizer whose reader has already been set to {@link #LINK_PHRASES}
     * @throws IOException if consuming the token stream fails
     */
    private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException {
        assertTokenStreamContents(tf,
            new String[]{"click", "link", "here", "again", "click", "http://lucene.apache.org", "here", "again", "a", "b", "c", "d"},
            new int[]{1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1});
    }

    /** Checks that external links with fragments, query strings and https yield a URL token followed by the link text. */
    public void testLinks() throws Exception {
        String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
        WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
        tf.setReader(new StringReader(test));
        assertTokenStreamContents(tf,
            new String[]{"http://lucene.apache.org/java/docs/index.html#news", "here", "http://lucene.apache.org/java/docs/index.html?b=c", "here", "https://lucene.apache.org/java/docs/index.html?b=c", "here"},
            new String[]{"elu", "el", "elu", "el", "elu", "el"});
    }

    /**
     * First checks that {@code TOKENS_ONLY} output for {@link #LINK_PHRASES} is the
     * same as in {@link #testLinkPhrases()} even when untokenized types are supplied,
     * then checks {@code UNTOKENIZED_ONLY} output, where category and italics spans
     * come out as single terms.
     */
    public void testLucene1133() throws Exception {
        HashSet<String> untoks = new HashSet<>();
        untoks.add(WikipediaTokenizer.CATEGORY);
        untoks.add(WikipediaTokenizer.ITALICS);

        WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, untoks);
        tf.setReader(new StringReader(LINK_PHRASES));
        checkLinkPhrases(tf);

        String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h   i   j]]";
        // Deliberately left on the two-argument constructor (no explicit attribute
        // factory), exactly as the original test invoked it.
        tf = new WikipediaTokenizer(WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
        tf.setReader(new StringReader(test));
        assertTokenStreamContents(tf,
            new String[]{"a b c d", "e f g", "link", "here", "link", "there", "italics here", "something", "more italics", "h   i   j"},
            new int[]{11, 32, 42, 47, 56, 61, 71, 86, 98, 124},
            new int[]{18, 37, 46, 51, 60, 66, 83, 95, 110, 133},
            new int[]{1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
    }

    /**
     * Checks {@code BOTH} mode: each category/italics span is emitted once as a
     * whole term and then again as its individual terms. A second pass over the
     * same input verifies the flags attribute of every token.
     */
    public void testBoth() throws Exception {
        HashSet<String> untoks = new HashSet<>();
        untoks.add(WikipediaTokenizer.CATEGORY);
        untoks.add(WikipediaTokenizer.ITALICS);
        String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h   i   j]]";

        WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
        tf.setReader(new StringReader(test));
        assertTokenStreamContents(tf,
            new String[]{"a b c d", "a", "b", "c", "d", "e f g", "e", "f", "g", "link", "here", "link", "there", "italics here", "italics", "here", "something", "more italics", "more", "italics", "h   i   j", "h", "i", "j"},
            new int[]{11, 11, 13, 15, 17, 32, 32, 34, 36, 42, 47, 56, 61, 71, 71, 79, 86, 98, 98, 103, 124, 124, 128, 132},
            new int[]{18, 12, 14, 16, 18, 37, 33, 35, 37, 46, 51, 60, 66, 83, 78, 83, 95, 110, 102, 110, 133, 125, 129, 133},
            new int[]{1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1});

        // Second pass: only the whole-span tokens are expected to carry the flag.
        final int u = WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG;
        int[] expectedFlags = new int[]{u, 0, 0, 0, 0, u, 0, 0, 0, 0, 0, 0, 0, u, 0, 0, 0, u, 0, 0, u, 0, 0, 0};
        // try-with-resources closes the tokenizer even when an assertion fails.
        try (WikipediaTokenizer flagsTokenizer = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks)) {
            flagsTokenizer.setReader(new StringReader(test));
            FlagsAttribute flagsAtt = flagsTokenizer.addAttribute(FlagsAttribute.class);
            flagsTokenizer.reset();
            for (int i = 0; i < expectedFlags.length; ++i) {
                assertTrue(flagsTokenizer.incrementToken());
                assertEquals("flags " + i, expectedFlags[i], flagsAtt.getFlags());
            }
            assertFalse(flagsTokenizer.incrementToken());
            // Complete the consumer workflow (reset / incrementToken / end / close).
            flagsTokenizer.end();
        }
    }

    /** Runs the tokenizer against random data via {@code checkRandomData}. */
    public void testRandomStrings() throws Exception {
        try (Analyzer a = newWikipediaAnalyzer()) {
            checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER, 20, false, false);
        }
    }

    /** Same as {@link #testRandomStrings()} but with a larger size argument (8192) and fewer iterations. */
    public void testRandomHugeStrings() throws Exception {
        Random random = random();
        try (Analyzer a = newWikipediaAnalyzer()) {
            checkRandomData(random, a, 100 * RANDOM_MULTIPLIER, 8192, false, false);
        }
    }

    /** Builds an analyzer whose only component is a {@code TOKENS_ONLY} tokenizer with no untokenized types. */
    private static Analyzer newWikipediaAnalyzer() {
        return new Analyzer() {

            @Override
            protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
                WikipediaTokenizer tokenizer = new WikipediaTokenizer(BaseTokenStreamTestCase.newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
                return new Analyzer.TokenStreamComponents(tokenizer, tokenizer);
            }
        };
    }
}

