How to find matching keywords across multiple text blocks in Python

1 Answer

0 votes
def tokenize(text: str) -> set:
    """Split text into a set of unique, lowercased alphanumeric words.

    A word is a maximal run of characters for which ``str.isalnum()`` is
    true; every other character (spaces, punctuation, hyphens, underscores)
    acts as a separator and is discarded. For example, ``"data-driven"``
    yields the two words ``"data"`` and ``"driven"``.

    Args:
        text: The text to tokenize.

    Returns:
        The set of distinct words found in ``text``, each lowercased
        character by character. Empty when ``text`` contains no
        alphanumeric characters.
    """
    words = set()
    current = []  # characters of the word currently being built

    for ch in text:
        if ch.isalnum():
            current.append(ch.lower())
        elif current:
            # A separator ends the word in progress.
            words.add("".join(current))
            current.clear()

    # Flush the last word when the text does not end with a separator.
    if current:
        words.add("".join(current))

    return words

def findMatchesMultiple(allSets: list) -> set:
    """Return the elements common to every set in ``allSets``.

    Works for any number of sets (one, two, three or more).

    Args:
        allSets: List of word sets, one per text block.

    Returns:
        A new set containing the elements present in all of the given
        sets, or an empty set when ``allSets`` is empty. The input sets
        are not modified.
    """
    if not allSets:
        return set()

    # Copy the first set so the result never aliases the caller's data,
    # then let the built-in intersection fold in the remaining sets.
    return set(allSets[0]).intersection(*allSets[1:])


def main():
    """Demonstrate keyword matching across three sample text blocks.

    Tokenizes each text, intersects the resulting word sets, and prints
    the words shared by all texts in alphabetical order.
    """
    # -------------------------------------------------------------
    # Three text blocks to compare
    # -------------------------------------------------------------
    text1 = (
        "Machine learning allows computers to learn from data. "
        "It is widely used in modern applications."
    )

    text2 = (
        "Data science uses machine learning techniques. "
        "Applications rely on data-driven models."
    )

    text3 = (
        "Modern applications of machine learning include data analysis, "
        "automation, and intelligent systems."
    )

    # -------------------------------------------------------------
    # Tokenize all texts into one word set per text
    # -------------------------------------------------------------
    all_sets = [tokenize(text) for text in (text1, text2, text3)]

    # -------------------------------------------------------------
    # Find keyword matches across ALL texts
    # -------------------------------------------------------------
    matches = findMatchesMultiple(all_sets)

    # -------------------------------------------------------------
    # Output results
    # -------------------------------------------------------------
    print("Matched Keywords Across ALL Texts:")
    # Sets are unordered and string hashing is randomized per process,
    # so sort the words to get the same output on every run.
    for word in sorted(matches):
        print(word, end=" ")
    # Terminate the output line.
    print()


# Run the demo only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()



"""
run:

Matched Keywords Across ALL Texts:
applications data learning machine 

"""

 



answered 9 hours ago by avibootz
edited 9 hours ago by avibootz

Related questions

...