How to find matching keywords between two text blocks in Java

1 Answer

0 votes
import java.util.HashSet;
import java.util.Set;

/**
 * Finds the keywords shared by two blocks of text.
 *
 * <p>Each text is reduced to a set of lower-cased alphanumeric words, and the
 * shared keywords are the intersection of the two sets.
 */
public class KeywordMatching {

    /**
     * Splits text into a set of distinct, lower-cased words.
     *
     * <p>A word is a maximal run of letters and/or digits; every other
     * character (spaces, punctuation, hyphens, ...) acts as a separator and is
     * discarded. The text is scanned by Unicode code point rather than by
     * UTF-16 {@code char}, so letters and digits outside the Basic
     * Multilingual Plane are kept instead of being treated as separators.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @return a new mutable set of the distinct words found in {@code text};
     *         empty when the text contains no letters or digits
     */
    Set<String> tokenize(String text) {
        Set<String> words = new HashSet<>();
        StringBuilder word = new StringBuilder();

        int index = 0;
        while (index < text.length()) {
            // Read a whole code point so surrogate pairs are classified as one character.
            int codePoint = text.codePointAt(index);
            index += Character.charCount(codePoint);

            if (Character.isLetterOrDigit(codePoint)) {
                word.appendCodePoint(Character.toLowerCase(codePoint));
            } else if (word.length() > 0) {
                // A separator ends the current word.
                words.add(word.toString());
                word.setLength(0);
            }
        }

        // Flush the last word when the text does not end with a separator.
        if (word.length() > 0) {
            words.add(word.toString());
        }

        return words;
    }

    /**
     * Finds the keyword matches between two word sets (set intersection).
     *
     * <p>Neither input set is modified.
     *
     * @param words1 the first set of words; must not be {@code null}
     * @param words2 the second set of words; must not be {@code null}
     * @return a new set containing only the words that appear in BOTH sets
     */
    Set<String> findMatches(Set<String> words1, Set<String> words2) {
        Set<String> matches = new HashSet<>();

        for (String w : words1) {
            if (words2.contains(w)) {
                matches.add(w);
            }
        }

        return matches;
    }

    /**
     * Demo: tokenizes two sample texts and prints each keyword set followed by
     * the keywords they have in common.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {

        KeywordMatching km = new KeywordMatching();

        // -------------------------------------------------------------
        // Two text blocks to compare
        // -------------------------------------------------------------
        String text1 =
            "Machine learning allows computers to learn from data. " +
            "It is widely used in modern applications.";

        String text2 =
            "Data science uses machine learning techniques. " +
            "Applications rely on data-driven models.";

        // -------------------------------------------------------------
        // Tokenize both texts
        // -------------------------------------------------------------
        Set<String> words1 = km.tokenize(text1);
        Set<String> words2 = km.tokenize(text2);

        // -------------------------------------------------------------
        // Find keyword matches (set intersection)
        // -------------------------------------------------------------
        Set<String> matches = km.findMatches(words1, words2);

        // -------------------------------------------------------------
        // Output results
        // -------------------------------------------------------------
        System.out.println("Keywords in Text 1:");
        for (String w : words1) {
            System.out.print(w + " ");
        }

        System.out.println("\n\nKeywords in Text 2:");
        for (String w : words2) {
            System.out.print(w + " ");
        }

        System.out.println("\n\nMatched Keywords:");
        for (String w : matches) {
            System.out.print(w + " ");
        }
    }
}



/*
run:

Keywords in Text 1:
allows data learn in widely learning is it used modern machine from to computers applications 

Keywords in Text 2:
models data rely machine techniques science driven uses learning applications on 

Matched Keywords:
data machine learning applications 

*/

 



answered 5 hours ago by avibootz

Related questions

...