How to find matching keywords between two text blocks in C++

1 Answer

0 votes
#include <algorithm> // transform
#include <cctype>
#include <iostream>
#include <set>
#include <string>

/*
    Tokenize text into a set of unique, lowercase words.
    - A word is a maximal run of characters for which std::isalnum is true
      (letters and digits)
    - Any other character (punctuation, whitespace, ...) ends the current word
    - Words are lowercased with std::tolower, so later matching is
      case-insensitive
    - Duplicates collapse because the result is a std::set

    Each char is converted to unsigned char before it is handed to the
    <cctype> functions: passing a negative char value (for example a byte
    of a multi-byte UTF-8 sequence on platforms where char is signed) to
    std::isalnum / std::tolower is undefined behavior.
*/
std::set<std::string> tokenize(const std::string &text) {
    std::set<std::string> words;
    std::string word;

    for (const char c : text) {
        const unsigned char uc = static_cast<unsigned char>(c);
        if (std::isalnum(uc)) {
            // std::tolower returns int; narrow back to char explicitly.
            word += static_cast<char>(std::tolower(uc));
        } else if (!word.empty()) {
            words.insert(word);
            word.clear();
        }
    }
    // Flush the trailing word when the text does not end with a separator.
    if (!word.empty()) words.insert(word);

    return words;
}
 
/*
    Find keyword matches (set intersection).
    -------------------------------------------------------------
    Receives two sets of words and returns a new set holding only the
    words that appear in BOTH of them.

    std::set iterates in ascending order, so the two sets are walked side
    by side: the cursor pointing at the smaller word advances, and a word
    is recorded whenever both cursors point at equal words.
*/
std::set<std::string> findMatches(const std::set<std::string> &words1,
                                  const std::set<std::string> &words2)
{
    std::set<std::string> common;

    auto left = words1.begin();
    auto right = words2.begin();

    while (left != words1.end() && right != words2.end()) {
        if (*left < *right) {
            ++left;
        } else if (*right < *left) {
            ++right;
        } else {
            // Equal words: present in both sets.
            common.insert(common.end(), *left);
            ++left;
            ++right;
        }
    }

    return common;
}
 
/*
    Demo driver: tokenizes two sample text blocks, intersects their word
    sets, and prints all three sets to standard output.
*/
int main() {
    // The two text blocks being compared.
    const std::string text1 =
        "Machine learning allows computers to learn from data. "
        "It is widely used in modern applications.";

    const std::string text2 =
        "Data science uses machine learning techniques. "
        "Applications rely on data-driven models.";

    // Reduce each text to its set of unique lowercase words.
    const std::set<std::string> words1 = tokenize(text1);
    const std::set<std::string> words2 = tokenize(text2);

    // Keywords common to both texts.
    const std::set<std::string> matches = findMatches(words1, words2);

    // Prints a heading followed by every word, each word trailed by a space.
    const auto printWords = [](const char *heading,
                               const std::set<std::string> &words) {
        std::cout << heading;
        for (const auto &word : words) {
            std::cout << word << " ";
        }
    };

    printWords("Keywords in Text 1:\n", words1);
    printWords("\n\nKeywords in Text 2:\n", words2);
    printWords("\n\nMatched Keywords:\n", matches);

    std::cout << "\n";
}
 
 
 
/*
run:
 
Keywords in Text 1:
allows applications computers data from in is it learn learning machine modern to used widely 
 
Keywords in Text 2:
applications data driven learning machine models on rely science techniques uses 
 
Matched Keywords:
applications data learning machine 
 
*/

 



answered May 21 by avibootz
edited May 22 by avibootz

Related questions

...