How to create an inverted index of a string with phrase search in C++

1 Answer

0 votes
// An inverted index of a string maps each word to the positions
// where that word appears. This version also supports phrase search.

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
#include <sstream> // istringstream

// ------------------------------------------------------------
// cleanWord
// Removes punctuation and converts to lowercase.
// Ensures consistent indexing.
// ------------------------------------------------------------
std::string cleanWord(const std::string& word) {
    std::string cleaned;

    for (char c : word) {
        if (std::isalnum(static_cast<unsigned char>(c))) {
            cleaned += std::tolower(static_cast<unsigned char>(c));
        }
    }

    return cleaned;
}

// ------------------------------------------------------------
// buildInvertedIndex
// Creates an inverted index mapping each word to the positions
// where it appears in the input string.
// ------------------------------------------------------------
std::unordered_map<std::string, std::vector<int>>
buildInvertedIndex(const std::string& text, std::vector<std::string>& orderedWords) {

    std::unordered_map<std::string, std::vector<int>> index;
    std::istringstream iss(text);
    std::string word;
    int position = 0;

    while (iss >> word) {
        std::string cleaned = cleanWord(word);

        if (!cleaned.empty()) {
            orderedWords.push_back(cleaned);
            index[cleaned].push_back(position);
            position++;
        }
    }

    return index;
}

// ------------------------------------------------------------
// phraseSearch
// Searches for a multi-word phrase using the ordered word list.
// Returns all starting positions where the phrase appears.
// ------------------------------------------------------------
std::vector<int> phraseSearch(
    const std::vector<std::string>& orderedWords,
    const std::string& phrase
) {
    std::vector<int> results;

    // Split phrase into cleaned words
    std::istringstream iss(phrase);
    std::string word;
    std::vector<std::string> phraseWords;

    while (iss >> word) {
        std::string cleaned = cleanWord(word);
        if (!cleaned.empty()) {
            phraseWords.push_back(cleaned);
        }
    }

    if (phraseWords.empty()) return results;

    // Scan orderedWords for matching sequences
    for (size_t i = 0; i + phraseWords.size() <= orderedWords.size(); i++) {
        bool match = true;

        for (size_t j = 0; j < phraseWords.size(); j++) {
            if (orderedWords[i + j] != phraseWords[j]) {
                match = false;
                break;
            }
        }

        if (match) {
            results.push_back(static_cast<int>(i));
        }
    }

    return results;
}

int main() {
    std::string text =
        "Imagine all the people living for today. "
        "Imagine there's no heaven. Imagine all the people living for today.";

    // Store words in order for phrase search
    std::vector<std::string> orderedWords;

    // Build the inverted index
    auto index = buildInvertedIndex(text, orderedWords);

    // Print the inverted index
    std::cout << "Inverted Index:\n";
    for (const auto& entry : index) {
        std::cout << entry.first << ": ";
        for (int pos : entry.second) {
            std::cout << pos << " ";
        }
        std::cout << "\n";
    }

    // Phrase search examples
    std::string phrase = "living for today";
    auto positions = phraseSearch(orderedWords, phrase);

    std::cout << "\nPhrase Search: \"" << phrase << "\"\nPositions: ";
    for (int pos : positions) {
        std::cout << pos << " ";
    }
    std::cout << "\n";
}



/*
run:

Inverted Index:
heaven: 10 
no: 9 
today: 6 17 
living: 4 15 
theres: 8 
people: 3 14 
the: 2 13 
all: 1 12 
for: 5 16 
imagine: 0 7 11 

Phrase Search: "living for today"
Positions: 4 15 

*/

 



answered May 10 by avibootz
...