How to find matching keywords across multiple text blocks in Java

1 Answer

0 votes
import java.util.HashSet;
import java.util.ArrayList;
import java.util.Set;
import java.util.List;

/**
 * Finds the keywords that several blocks of text have in common.
 *
 * <p>Each text is reduced to a set of lower-cased words, and the sets are then
 * intersected so that only words present in every text remain.
 */
public class KeywordMatching {

    /**
     * Splits text into a set of distinct, lower-cased words.
     *
     * <p>A word is a maximal run of letters and/or digits; every other
     * character (spaces, punctuation, hyphens, ...) acts as a separator.
     * The text is scanned by Unicode code point rather than by UTF-16
     * {@code char}, so letters outside the Basic Multilingual Plane are kept
     * as part of a word instead of being treated as separators.
     *
     * @param text the text to tokenize; {@code null} is treated as empty text
     * @return a new mutable set of the words found (empty if there are none)
     */
    public static Set<String> tokenize(String text) {
        Set<String> words = new HashSet<>();
        if (text == null) {
            return words;
        }

        StringBuilder word = new StringBuilder();

        for (int i = 0; i < text.length(); ) {
            int codePoint = text.codePointAt(i);
            // Advance by 1 or 2 chars depending on whether this is a surrogate pair.
            i += Character.charCount(codePoint);

            if (Character.isLetterOrDigit(codePoint)) {
                word.appendCodePoint(Character.toLowerCase(codePoint));
            } else if (word.length() > 0) {
                // Separator reached: the word collected so far is complete.
                words.add(word.toString());
                word.setLength(0);
            }
        }

        // Flush the last word when the text does not end with a separator.
        if (word.length() > 0) {
            words.add(word.toString());
        }

        return words;
    }

    /**
     * Returns the words that occur in every one of the given sets
     * (the intersection of all sets).
     *
     * <p>The input list and its sets are never modified.
     *
     * @param allSets the word sets to intersect; may hold any number of sets,
     *                and {@code null} is treated as an empty list
     * @return a new mutable set holding the words common to all sets; empty
     *         when {@code allSets} is {@code null} or empty
     */
    public static Set<String> findMatchesMultiple(List<Set<String>> allSets) {
        if (allSets == null || allSets.isEmpty()) {
            return new HashSet<>();
        }

        // Start with a copy of the first set so the caller's set stays untouched.
        Set<String> result = new HashSet<>(allSets.get(0));

        // Intersect with each remaining set; stop early once nothing is left,
        // because an empty intersection can never grow again.
        for (int i = 1; i < allSets.size() && !result.isEmpty(); i++) {
            Set<String> current = allSets.get(i);
            Set<String> common = new HashSet<>();

            for (String w : result) {
                if (current.contains(w)) {
                    common.add(w);
                }
            }

            result = common;
        }

        return result;
    }

    /**
     * Demo: tokenizes three sample texts and prints the keywords they share.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {

        // -------------------------------------------------------------
        // Three text blocks to compare
        // -------------------------------------------------------------
        String text1 =
            "Machine learning allows computers to learn from data. " +
            "It is widely used in modern applications.";

        String text2 =
            "Data science uses machine learning techniques. " +
            "Applications rely on data-driven models.";

        String text3 =
            "Modern applications of machine learning include data analysis, " +
            "automation, and intelligent systems.";

        // -------------------------------------------------------------
        // Tokenize all texts
        // -------------------------------------------------------------
        Set<String> words1 = tokenize(text1);
        Set<String> words2 = tokenize(text2);
        Set<String> words3 = tokenize(text3);

        // Put them into a list for multi-text comparison
        List<Set<String>> allSets = new ArrayList<>();
        allSets.add(words1);
        allSets.add(words2);
        allSets.add(words3);

        // -------------------------------------------------------------
        // Find keyword matches across ALL texts
        // -------------------------------------------------------------
        Set<String> matches = findMatchesMultiple(allSets);

        // -------------------------------------------------------------
        // Output results
        // -------------------------------------------------------------
        System.out.println("Matched Keywords Across ALL Texts:");
        for (String w : matches) {
            System.out.print(w + " ");
        }
    }
}


/*
run:

Matched Keywords Across ALL Texts:
data machine learning applications 

*/

 



answered 9 hours ago by avibootz
edited 9 hours ago by avibootz

Related questions

...