/*
 * Decompiled with CFR 0.152.
 */
package org.pageseeder.diffx.load.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.pageseeder.diffx.config.WhiteSpaceProcessing;
import org.pageseeder.diffx.load.text.TextTokenizer;
import org.pageseeder.diffx.load.text.Tokenizers;
import org.pageseeder.diffx.token.TextToken;
import org.pageseeder.diffx.token.impl.CharactersToken;
import org.pageseeder.diffx.token.impl.IgnorableSpaceToken;

/**
 * A text tokenizer that splits character sequences after each run of punctuation marks.
 *
 * <p>Every token produced for text containing punctuation ends with a run of one or more of the
 * characters in {@link #PUNCTUATION_MARKS} and includes all the text that precedes that run since
 * the previous token (leading white space included). Any text left after the last punctuation run
 * is returned as a final token, subject to the white space processing option.
 *
 * <p>For example, {@code "Hello, world! Bye"} is split into {@code "Hello,"}, {@code " world!"}
 * and {@code " Bye"}.
 */
public final class TokenizerByPunctuation implements TextTokenizer {

    /**
     * The characters considered as punctuation marks delimiting the end of a token.
     */
    private static final String PUNCTUATION_MARKS = ".,?!;";

    /**
     * Matches a run of one or more punctuation marks.
     *
     * <p>Compiled once: a {@code Pattern} is immutable and safe to share between threads, so there
     * is no reason to recompile it on every call to {@link #tokenize(CharSequence)}.
     */
    private static final Pattern PUNCTUATION_RUN = Pattern.compile("[" + PUNCTUATION_MARKS + "]+");

    /**
     * Defines how a trailing white-space-only chunk is handled.
     */
    private final WhiteSpaceProcessing whitespace;

    /**
     * Creates a new tokenizer.
     *
     * @param whitespace how white space should be processed; only affects the text remaining
     *                   after the last punctuation run
     *
     * @throws NullPointerException if the white space processing is {@code null}
     */
    public TokenizerByPunctuation(WhiteSpaceProcessing whitespace) {
        if (whitespace == null) {
            throw new NullPointerException("the white space processing must be specified.");
        }
        this.whitespace = whitespace;
    }

    /**
     * Splits the specified text into tokens, each ending after a run of punctuation marks.
     *
     * @param text the character sequence to tokenize
     *
     * @return the list of tokens in the order they appear in the text; an immutable empty list
     *         if the text is empty
     *
     * @throws NullPointerException if the text is {@code null}
     */
    @Override
    public List<TextToken> tokenize(CharSequence text) {
        if (text == null) {
            throw new NullPointerException("Character sequence is null");
        }
        if (text.length() == 0) {
            return Collections.emptyList();
        }
        // Tokens are sentence-like chunks, so there are far fewer of them than characters:
        // let the list grow on demand rather than reserving one slot per character.
        List<TextToken> tokens = new ArrayList<TextToken>();
        Matcher matcher = PUNCTUATION_RUN.matcher(text);
        int index = 0;
        while (matcher.find()) {
            // A match is never empty and never starts before the previous match ended, so the
            // chunk [index, end) always holds at least the punctuation run itself.
            int end = matcher.end();
            tokens.add(new CharactersToken(text.subSequence(index, end)));
            index = end;
        }
        // Whatever follows the last punctuation run (or the whole text if there was none)
        if (index != text.length()) {
            TextToken remainder = toToken(text.subSequence(index, text.length()), this.whitespace);
            if (remainder != null) {
                tokens.add(remainder);
            }
        }
        return tokens;
    }

    /**
     * Returns the token matching the specified chunk of text.
     *
     * <p>A chunk reported as white space by {@code Tokenizers.isWhitespace} yields no token when
     * white space is ignored and an {@code IgnorableSpaceToken} otherwise; any other chunk yields
     * a {@code CharactersToken}.
     *
     * @param text       the chunk of text to turn into a token
     * @param whitespace the white space processing option
     *
     * @return the corresponding token, or {@code null} if the chunk is white space and white
     *         space is ignored
     */
    private static TextToken toToken(CharSequence text, WhiteSpaceProcessing whitespace) {
        if (!Tokenizers.isWhitespace(text)) {
            return new CharactersToken(text);
        }
        if (whitespace == WhiteSpaceProcessing.IGNORE) {
            return null;
        }
        return new IgnorableSpaceToken(text);
    }

    /**
     * Convenience method that tokenizes the specified sequence with a new tokenizer instance.
     *
     * @param seq        the character sequence to tokenize
     * @param whitespace how white space should be processed
     *
     * @return the list of tokens for the sequence
     *
     * @throws NullPointerException if either argument is {@code null}
     */
    public static List<TextToken> tokenize(CharSequence seq, WhiteSpaceProcessing whitespace) {
        return new TokenizerByPunctuation(whitespace).tokenize(seq);
    }
}

