import { complexTokenPatterns, complexTokenPlaceholderPrefix } from './constants';

// Word segmenter cache: starts out unset and is filled in by getTokensFromString on first use.
let segmenter: Intl.Segmenter | undefined;

/**
 * Splits `text` into word tokens using `Intl.Segmenter` with word granularity.
 *
 * Before segmentation, every substring matched by a "complex token" pattern is swapped
 * for a placeholder so the segmenter cannot split it; the original substring is restored
 * (lower-cased) when its placeholder is encountered among the segments.
 *
 * Whitespace-only segments and segments rejected by `isSymbol` / `isEmoji` are dropped.
 *
 * @param text The string to tokenize.
 * @param additonalTokenExclusionPatterns Extra patterns whose matches are kept as single
 *   tokens. They are applied before the default `complexTokenPatterns`.
 * @param skipLowerCaseConversionStrings Segments that are emitted unchanged (not lower-cased)
 *   when they match one of these strings exactly. This does not apply to restored complex
 *   tokens, which are always lower-cased.
 * @returns The tokens in order of appearance, or an empty array when `Intl.Segmenter`
 *   is not available in the current runtime.
 */
export function getTokensFromString(
    text: string,
    additonalTokenExclusionPatterns?: RegExp[],
    skipLowerCaseConversionStrings?: string[]
): string[] {
    // Pre-process with regex to handle complex cases which simple segmentation cannot handle.
    const complexTokens: string[] = [];

    // Prioritize additional patterns over default ones.
    // Notably, KQL tokens should be extracted before quoted keywords, since KQL values may be in quotes.
    const complexQueryTokenPatterns: RegExp[] = additonalTokenExclusionPatterns
        ? [...additonalTokenExclusionPatterns, ...complexTokenPatterns]
        : [...complexTokenPatterns];

    // Work on a local copy instead of reassigning the `text` parameter.
    // NOTE(review): String.prototype.replace only replaces every occurrence when the pattern
    // carries the `g` flag - confirm that all supplied patterns are global.
    let processedText = text;
    for (const [patternIndex, pattern] of complexQueryTokenPatterns.entries()) {
        processedText = processedText.replace(pattern, match => {
            // The trailing number is the position of the match inside `complexTokens`;
            // it is what gets parsed back out when the placeholder is restored below.
            const placeholder = `${complexTokenPlaceholderPrefix}${patternIndex}_${complexTokens.length}`;
            complexTokens.push(match);
            return placeholder;
        });
    }

    // Create the segmenter on first use and keep it in the module-level cache.
    if (!segmenter) {
        segmenter = Intl.Segmenter ? new Intl.Segmenter('en', { granularity: 'word' }) : undefined;
        if (!segmenter) {
            return []; // TODO: fall back to a simpler tokenizer if Intl.segmenter isn't available
        }
    }

    const tokens: string[] = [];
    for (const { segment } of segmenter.segment(processedText)) {
        // Skip whitespace (this includes newlines from the parser), lone symbols
        // such as '.' or '/', and emoji.
        if (segment.trim() === '' || isSymbol(segment) || isEmoji(segment)) {
            continue;
        }

        if (segment.startsWith(complexTokenPlaceholderPrefix)) {
            // The text after the last '_' is the index of the complex token.
            const complexTokenIndex = parseInt(segment.split('_').pop() || '0', 10);
            const complexToken = complexTokens[complexTokenIndex];
            if (complexToken) {
                tokens.push(complexToken.toLocaleLowerCase());
            }
        } else if (skipLowerCaseConversionStrings?.includes(segment)) {
            tokens.push(segment);
        } else {
            tokens.push(segment.toLocaleLowerCase());
        }
    }
    return tokens;
}

/**
 * Returns true when `value` is exactly one character (one Unicode code point) that is
 * neither a letter, a number, nor an underscore - e.g. '.', '/', ' ' or '\n'.
 *
 * The check is Unicode-aware: single-character words outside ASCII (such as 'é' or '我')
 * are letters and therefore not symbols, and symbols outside the Basic Multilingual Plane
 * (which occupy two UTF-16 code units) are still recognised as a single character.
 */
function isSymbol(value: string): boolean {
    // With the `u` flag the character class consumes one full code point, so the
    // anchors restrict the match to single-character strings.
    return /^[^\p{L}\p{N}_]$/u.test(value);
}

/**
 * Returns true when `value` contains an emoji: a pictographic character, a
 * regional-indicator (flag) character, or the combining keycap mark.
 *
 * `\p{Emoji}` is deliberately not used: that property also covers the ASCII digits,
 * '#' and '*', which would classify plain tokens such as '5' as emoji. No length
 * restriction is applied either, because most emoji are encoded as surrogate pairs
 * or multi-code-point sequences and so have a string length greater than one.
 */
function isEmoji(value: string): boolean {
    // \u20E3 is COMBINING ENCLOSING KEYCAP, the mark that turns e.g. '1' into a keycap emoji.
    return /[\p{Extended_Pictographic}\p{Regional_Indicator}\u20E3]/u.test(value);
}
