"use strict";
// Flag the exports object with `__esModule: true`, the conventional marker
// emitted for modules transpiled from ES module syntax.
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the export slot up front; it is assigned the class once the class
// has been defined further down in this file.
exports.RegExpMatcher = void 0;
const Char_1 = require("../../util/Char");
const Util_1 = require("../../pattern/Util");
const TransformerSet_1 = require("../../transformer/TransformerSet");
const CharacterIterator_1 = require("../../util/CharacterIterator");
const IntervalCollection_1 = require("../IntervalCollection");
const MatchPayload_1 = require("../MatchPayload");
/**
 * An implementation of the [[Matcher]] interface using regular expressions and
 * string searching methods.
 *
 * Blacklisted terms are compiled to regular expressions and run against a
 * transformed copy of the input; whitelisted terms are located with plain
 * substring search on a (separately) transformed copy. All reported indices
 * refer to the original, untransformed input.
 */
class RegExpMatcher {
    /**
     * Creates a new [[RegExpMatcher]] with the options given.
     *
     * @example
     * ```typescript
     * // Use the options provided by the English preset.
     * const matcher = new RegExpMatcher({
     * 	...englishDataset.build(),
     * 	...englishRecommendedTransformers,
     * });
     * ```
     * @example
     * ```typescript
     * // Simple matcher that only has blacklisted patterns.
     * const matcher = new RegExpMatcher({
     *  blacklistedTerms: assignIncrementingIds([
     *      pattern`fuck`,
     *      pattern`f?uck`, // wildcards (?)
     *      pattern`bitch`,
     *      pattern`b[i]tch` // optionals ([i] matches either "i" or "")
     *  ]),
     * });
     *
     * // Check whether some string matches any of the patterns.
     * const doesMatch = matcher.hasMatch('fuck you bitch');
     * ```
     * @example
     * ```typescript
     * // A more advanced example, with transformers and whitelisted terms.
     * const matcher = new RegExpMatcher({
     *  blacklistedTerms: [
     *      { id: 1, pattern: pattern`penis` },
     *      { id: 2, pattern: pattern`fuck` },
     *  ],
     *  whitelistedTerms: ['pen is'],
     *  blacklistMatcherTransformers: [
     *      resolveConfusablesTransformer(), // '🅰' => 'a'
     *      resolveLeetSpeakTransformer(), // '$' => 's'
     *      foldAsciiCharCaseTransformer(), // case insensitive matching
     *      skipNonAlphabeticTransformer(), // 'f.u...c.k' => 'fuck'
     *      collapseDuplicatesTransformer(), // 'aaaa' => 'a'
     *  ],
     * });
     *
     * // Output all matches.
     * console.log(matcher.getAllMatches('fu.....uuuuCK the pen is mightier than the sword!'));
     * ```
     * @param options - Options to use.
     * @throws Error if two blacklisted terms share an ID, if a blacklisted
     * pattern potentially matches the empty string, or if a whitelisted term
     * is the empty string.
     */
    constructor({ blacklistedTerms, whitelistedTerms = [], blacklistMatcherTransformers = [], whitelistMatcherTransformers = [], }) {
        this.blacklistedTerms = this.compileTerms(blacklistedTerms);
        this.validateWhitelistedTerms(whitelistedTerms);
        // Keep a private snapshot: if the caller later mutated the array they
        // passed in (for instance by adding an empty string), the validation
        // above would no longer hold and the whitelist search could never
        // advance past an empty term.
        this.whitelistedTerms = [...whitelistedTerms];
        this.blacklistMatcherTransformers = new TransformerSet_1.TransformerSet(blacklistMatcherTransformers);
        this.whitelistMatcherTransformers = new TransformerSet_1.TransformerSet(whitelistMatcherTransformers);
    }
    /**
     * Collects every blacklisted-term match in the input that is not
     * suppressed by the whitelist.
     *
     * @param input - Text to scan.
     * @param sorted - When true, the result is sorted with
     * `compareMatchByPositionAndId`; otherwise matches are grouped by term in
     * the order the terms were supplied.
     * @returns An array of `{ termId, startIndex, endIndex, matchLength }`
     * objects. `startIndex` and `endIndex` (inclusive) are indices into
     * `input`; `matchLength` is the number of code points in the matched
     * portion of the transformed text.
     */
    getAllMatches(input, sorted = false) {
        const matches = [...this.findMatches(input)];
        if (sorted)
            matches.sort(MatchPayload_1.compareMatchByPositionAndId);
        return matches;
    }
    /**
     * Checks whether the input contains at least one blacklisted-term match
     * that is not suppressed by the whitelist.
     *
     * @param input - Text to scan.
     * @returns `true` if such a match exists, `false` otherwise.
     */
    hasMatch(input) {
        // Only the first value is pulled, so scanning stops at the first hit.
        return !this.findMatches(input).next().done;
    }
    /**
     * Lazily yields the non-whitelisted matches of every blacklisted term, in
     * term order and then in the order `matchAll` reports them.
     *
     * Shared by `getAllMatches` and `hasMatch` so both apply identical index
     * mapping and whitelist filtering.
     *
     * @param input - Text to scan.
     * @returns A generator of match payload objects.
     */
    *findMatches(input) {
        const whitelistedIntervals = this.getWhitelistedIntervals(input);
        const [transformedToOrigIndex, transformed] = this.applyTransformers(input, this.blacklistMatcherTransformers);
        for (const blacklistedTerm of this.blacklistedTerms) {
            for (const match of transformed.matchAll(blacklistedTerm.regExp)) {
                const origStartIndex = transformedToOrigIndex[match.index];
                const origEndIndex = this.resolveOrigEndIndex(input, transformedToOrigIndex, match.index + match[0].length - 1);
                // Skip matches the whitelist interval collection reports for this span.
                if (whitelistedIntervals.query(origStartIndex, origEndIndex))
                    continue;
                yield {
                    termId: blacklistedTerm.id,
                    startIndex: origStartIndex,
                    endIndex: origEndIndex,
                    matchLength: [...match[0]].length,
                };
            }
        }
    }
    /**
     * Maps the index of the last matched unit in the transformed text back to
     * an inclusive end index in the original input.
     *
     * The index map points at the first UTF-16 unit of each original
     * character. Because end indices are inclusive, an original character
     * that is a surrogate pair needs the end moved onto its low surrogate.
     *
     * @param input - The original input.
     * @param transformedToOrigIndex - Index map produced by `applyTransformers`.
     * @param lastTransformedIndex - Index of the last matched unit in the
     * transformed text.
     * @returns The inclusive end index in `input`.
     */
    resolveOrigEndIndex(input, transformedToOrigIndex, lastTransformedIndex) {
        const origIndex = transformedToOrigIndex[lastTransformedIndex];
        const endsOnSurrogatePair = origIndex < input.length - 1 && // not the last character
            (0, Char_1.isHighSurrogate)(input.charCodeAt(origIndex)) && // character is a high surrogate
            (0, Char_1.isLowSurrogate)(input.charCodeAt(origIndex + 1)); // next character is a low surrogate
        return endsOnSurrogatePair ? origIndex + 1 : origIndex;
    }
    /**
     * Finds every occurrence of every whitelisted term in the input (after
     * applying the whitelist transformers) and records the corresponding
     * original-input spans.
     *
     * Occurrences of a single term are searched left to right, resuming after
     * the end of the previous occurrence.
     *
     * @param input - Text to scan.
     * @returns An `IntervalCollection` holding one inclusive
     * `[start, end]` span per occurrence found.
     */
    getWhitelistedIntervals(input) {
        const intervals = new IntervalCollection_1.IntervalCollection();
        const [transformedToOrigIndex, transformed] = this.applyTransformers(input, this.whitelistMatcherTransformers);
        for (const whitelistedTerm of this.whitelistedTerms) {
            let searchFrom = 0;
            for (;;) {
                const startIndex = transformed.indexOf(whitelistedTerm, searchFrom);
                if (startIndex === -1)
                    break;
                searchFrom = startIndex + whitelistedTerm.length;
                const origEndIndex = this.resolveOrigEndIndex(input, transformedToOrigIndex, searchFrom - 1);
                intervals.insert(transformedToOrigIndex[startIndex], origEndIndex);
            }
        }
        return intervals;
    }
    /**
     * Runs every character of the input through the given transformer set and
     * builds the transformed text together with an index map back to the
     * original input.
     *
     * Characters for which the transformer set returns `undefined` are left
     * out of the transformed text. The transformer set is always reset
     * afterwards, even when a transformer (or the code point conversion)
     * throws, so a failed pass does not leave state behind for the next input.
     *
     * @param input - The original input.
     * @param transformers - Transformer set to apply; its `applyTo` result is
     * treated as a code point.
     * @returns A pair `[transformedToOrigIndex, transformed]` where
     * `transformedToOrigIndex[i]` is the iterator position recorded for the
     * original character that produced UTF-16 unit `i` of `transformed`.
     */
    applyTransformers(input, transformers) {
        const transformedToOrigIndex = [];
        let transformed = '';
        const iter = new CharacterIterator_1.CharacterIterator(input);
        try {
            for (const char of iter) {
                const transformedChar = transformers.applyTo(char);
                if (transformedChar === undefined)
                    continue;
                transformed += String.fromCodePoint(transformedChar);
                // One map entry per UTF-16 unit appended (two for astral code points).
                while (transformedToOrigIndex.length < transformed.length)
                    transformedToOrigIndex.push(iter.position);
            }
        }
        finally {
            transformers.resetAll();
        }
        return [transformedToOrigIndex, transformed];
    }
    /**
     * Validates the blacklisted terms and compiles each pattern to a regular
     * expression.
     *
     * @param terms - Iterable of `{ id, pattern }` terms.
     * @returns An array of `{ id, regExp }` in the same order as `terms`.
     * @throws Error if an ID occurs more than once or a pattern potentially
     * matches the empty string.
     */
    compileTerms(terms) {
        const compiled = [];
        const seenIds = new Set();
        for (const term of terms) {
            if (seenIds.has(term.id))
                throw new Error(`Duplicate blacklisted term ID ${term.id}.`);
            if ((0, Util_1.potentiallyMatchesEmptyString)(term.pattern)) {
                throw new Error(`Pattern with ID ${term.id} potentially matches empty string; this is unsupported.`);
            }
            compiled.push({
                id: term.id,
                regExp: (0, Util_1.compilePatternToRegExp)(term.pattern),
            });
            seenIds.add(term.id);
        }
        return compiled;
    }
    /**
     * Rejects whitelists that contain an empty string.
     *
     * @param whitelist - Array of whitelisted terms.
     * @throws Error if any term has length zero.
     */
    validateWhitelistedTerms(whitelist) {
        if (whitelist.some((term) => term.length === 0)) {
            throw new Error('Whitelisted term set contains empty string; this is unsupported.');
        }
    }
}
// Publish the class as this module's `RegExpMatcher` export.
exports.RegExpMatcher = RegExpMatcher;
