How to find matching keywords across multiple text blocks in TypeScript

1 Answer

0 votes
/**
 * Tokenize text into a set of unique, lower-cased words.
 *
 * - A word is a maximal run of letters, combining marks and digits in any
 *   script (Unicode categories L, M and N), so a word such as "café" stays
 *   whole instead of being cut at its non-ASCII letter.
 * - Every other character (punctuation, whitespace, symbols) is a separator.
 *
 * @param text - The text to split into words.
 * @returns The distinct words in order of first appearance; an empty set when
 *          the text contains no letters or digits.
 */
function tokenize(text: string): Set<string> {
    // Combining marks (\p{M}) are part of a word so that decomposed accents
    // (base letter + combining mark) stay attached to their base letter.
    const wordPattern = /[\p{L}\p{M}\p{N}]+/gu;
    const words = new Set<string>();

    // match() returns null, not an empty array, when nothing matches.
    for (const token of text.match(wordPattern) ?? []) {
        words.add(token.toLowerCase());
    }

    return words;
}

/**
 * Compute the words shared by every set in the given list.
 *
 * The first set seeds the result, and each following set filters it down to
 * the words it also contains. The returned set is always a new object, and
 * its iteration order follows the first set.
 *
 * @param allSets - The word sets to intersect; may hold any number of sets.
 * @returns The intersection of all sets, or an empty set when the list is empty.
 */
function findMatchesMultiple(allSets: Array<Set<string>>): Set<string> {
    if (allSets.length === 0) {
        return new Set<string>();
    }

    const [first, ...rest] = allSets;

    // Fold the remaining sets into the running intersection, one at a time.
    return rest.reduce(
        (common, current) =>
            new Set<string>([...common].filter((word) => current.has(word))),
        new Set<string>(first),
    );
}

// -------------------------------------------------------------
// Sample input: three short passages with overlapping vocabulary
// -------------------------------------------------------------
const text1 = [
    "Machine learning allows computers to learn from data. ",
    "It is widely used in modern applications.",
].join("");

const text2 = [
    "Data science uses machine learning techniques. ",
    "Applications rely on data-driven models.",
].join("");

const text3 = [
    "Modern applications of machine learning include data analysis, ",
    "automation, and intelligent systems.",
].join("");

// -------------------------------------------------------------
// Reduce each passage to its set of distinct lower-cased words
// -------------------------------------------------------------
const words1 = tokenize(text1);
const words2 = tokenize(text2);
const words3 = tokenize(text3);

// Collect the word sets so they can be intersected in one call
const allSets = [words1, words2, words3];

// -------------------------------------------------------------
// Keep only the words that occur in every passage
// -------------------------------------------------------------
const matches = findMatchesMultiple(allSets);

// -------------------------------------------------------------
// Print one matched keyword per line
// -------------------------------------------------------------
console.log("Matched Keywords Across ALL Texts:");
matches.forEach((keyword) => {
    console.log(keyword + " ");
});



/*
run:

Matched Keywords Across ALL Texts:
machine 
learning 
data 
applications 

*/

 



answered 4 hours ago by avibootz

Related questions

...