How to create an inverted index of a string in C

1 Answer

0 votes
// An inverted index of a string is building a data structure that
// maps each word to positions where that word appears.

#include <stdio.h>
#include <string.h>
#include <ctype.h>

#define MAX_WORDS 256
#define MAX_WORD_LEN 64

// ------------------------------------------------------------
// cleanWord
// Removes punctuation and converts to lowercase.
// This ensures consistent indexing.
// ------------------------------------------------------------
void cleanWord(const char *input, char *output) {
    int j = 0;

    for (int i = 0; input[i] != '\0'; i++) {
        if (isalnum((unsigned char)input[i])) {
            output[j++] = tolower((unsigned char)input[i]);
        }
    }

    output[j] = '\0';
}

// ------------------------------------------------------------
// findWordIndex
// Returns index of word in dictionary, or -1 if not found.
// ------------------------------------------------------------
int findWordIndex(char words[][MAX_WORD_LEN], int wordCount, const char *word) {
    for (int i = 0; i < wordCount; i++) {
        if (strcmp(words[i], word) == 0) {
            return i;
        }
    }
    
    return -1;
}

// ------------------------------------------------------------
// buildInvertedIndex
// Builds an inverted index using simple arrays.
// words[] holds unique words.
// positions[][] holds lists of positions for each word.
// ------------------------------------------------------------
int buildInvertedIndex(
    const char *text,
    char words[][MAX_WORD_LEN],
    int positions[][MAX_WORDS],
    int counts[]
) {
    char buffer[1024] = "";
    strcpy(buffer, text);

    int wordCount = 0;
    int position = 0;

    char *token = strtok(buffer, " ");
    while (token != NULL) {

        char cleaned[MAX_WORD_LEN];
        cleanWord(token, cleaned);

        if (cleaned[0] != '\0') {
            int idx = findWordIndex(words, wordCount, cleaned);

            if (idx == -1) {
                // New word
                strcpy(words[wordCount], cleaned);
                positions[wordCount][counts[wordCount]++] = position;
                wordCount++;
            } else {
                // Existing word
                positions[idx][counts[idx]++] = position;
            }
        }

        position++;
        token = strtok(NULL, " ");
    }

    return wordCount;
}

int main() {
    const char text[] =
        "Imagine all the people living for today. "
        "Imagine there's no heaven. Imagine all the people.";

    char words[MAX_WORDS][MAX_WORD_LEN] = {0};
    int positions[MAX_WORDS][MAX_WORDS] = {0};
    int counts[MAX_WORDS] = {0};

    int wordCount = buildInvertedIndex(text, words, positions, counts);

    printf("Inverted Index:\n");

    for (int i = 0; i < wordCount; i++) {
        printf("%s: ", words[i]);
        for (int j = 0; j < counts[i]; j++) {
            printf("%d ", positions[i][j]);
        }
        printf("\n");
    }

    return 0;
}


/*
run:

Inverted Index:
imagine: 0 7 11 
all: 1 12 
the: 2 13 
people: 3 14 
living: 4 
for: 5 
today: 6 
theres: 8 
no: 9 
heaven: 10 

*/

 



answered May 10 by avibootz
...