How to find matching keywords between two text blocks in C

1 Answer

0 votes
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/*
    Lowercase a NUL-terminated string in place.

    Each character is passed through tolower() as an unsigned char,
    so bytes outside the 7-bit ASCII range are handled safely.
*/
void toLower(char *s) {
    char *p = s;

    while (*p != '\0') {
        *p = (char)tolower((unsigned char)*p);
        p++;
    }
}

/*
    Linear membership test (stands in for a set lookup).

    Returns 1 when one of the first `size` entries of `list` compares
    equal to `word` with strcmp(), and 0 otherwise. The comparison is
    case-sensitive. `list` is not dereferenced when `size` is 0.
*/
int contains(char **list, int size, const char *word) {
    int idx = 0;

    while (idx < size) {
        if (!strcmp(list[idx], word)) {
            return 1;
        }
        idx++;
    }

    return 0;
}

/*
    Append a private copy of `word` to a growable list of unique strings.

    list - address of the array pointer; may point to NULL for an empty list.
           Updated in place when the array is reallocated.
    size - address of the element count; incremented when a word is added.
    word - NUL-terminated string to add.

    Nothing happens if `word` is already present. The caller owns every
    stored string and the array itself and must free() them.

    If an allocation fails, the list and the count are left exactly as
    they were (the word is simply not added), so the caller's data stays
    valid and freeable.
*/
void addWord(char ***list, int *size, const char *word) {
    if (contains(*list, *size, word)) {
        return;
    }

    /* Copy the word first so a failure here cannot disturb the list. */
    size_t len = strlen(word) + 1;
    char *copy = malloc(len);
    if (copy == NULL) {
        return;
    }
    memcpy(copy, word, len);

    /* Use a temporary: on failure realloc() leaves the old block alive,
       and overwriting *list with NULL would leak it. */
    char **grown = realloc(*list, ((size_t)*size + 1) * sizeof *grown);
    if (grown == NULL) {
        free(copy);
        return;
    }

    grown[*size] = copy;
    *list = grown;
    (*size)++;
}

/*
    Split text into a list of unique, lowercased words.

    - A word is a maximal run of letters and digits (isalnum).
    - Every other character (spaces, punctuation, ...) is a separator.
    - Words are lowercased as they are collected.
    - Duplicates are dropped by addWord(), so each word appears once,
      in order of first appearance.
    - Words longer than 255 characters are truncated to their first
      255 characters; the remainder of the run is ignored.

    text    - NUL-terminated input; NULL is treated as empty text.
    outSize - receives the number of words returned.

    Returns a heap-allocated array of heap-allocated strings (NULL when
    there are no words). The caller frees each string and then the array.
*/
char **tokenize(const char *text, int *outSize) {
    char **words = NULL;
    *outSize = 0;

    if (text == NULL) {
        return words;
    }

    char buffer[256];
    size_t bi = 0;

    for (size_t i = 0; text[i] != '\0'; i++) {
        /* <ctype.h> functions require a value representable as
           unsigned char; a plain char may be negative. */
        unsigned char c = (unsigned char)text[i];

        if (isalnum(c)) {
            /* Keep one slot free for the terminator; characters that
               do not fit are dropped rather than written out of bounds. */
            if (bi < sizeof buffer - 1) {
                buffer[bi++] = (char)tolower(c);
            }
        } else if (bi > 0) {
            buffer[bi] = '\0';
            addWord(&words, outSize, buffer);
            bi = 0;
        }
    }

    /* Flush a word that runs up to the end of the text. */
    if (bi > 0) {
        buffer[bi] = '\0';
        addWord(&words, outSize, buffer);
    }

    return words;
}

/*
    Find keyword matches (set intersection).

    Walks the first word list and collects every word that is also
    present in the second list. The result holds fresh copies of the
    matching words, in the order they occur in `words1`, without
    duplicates.

    matchCount receives the number of matches. The returned array
    (NULL when nothing matches) and its strings belong to the caller.
*/
char **findMatches(char **words1, int size1, char **words2, int size2, int *matchCount) {
    char **common = NULL;
    int idx = 0;

    *matchCount = 0;

    while (idx < size1) {
        const char *candidate = words1[idx++];

        if (!contains(words2, size2, candidate)) {
            continue;
        }
        addWord(&common, matchCount, candidate);
    }

    return common;
}

/* Print every word of the list, each followed by a single space. */
static void printWords(char **list, int count) {
    for (int k = 0; k < count; k++) {
        printf("%s ", list[k]);
    }
}

/* Release every string in the list, then the list array itself. */
static void freeWords(char **list, int count) {
    for (int k = 0; k < count; k++) {
        free(list[k]);
    }
    free(list);
}

/*
    Demo driver: tokenizes two fixed text blocks, intersects their
    keyword sets and prints both keyword lists plus the matches.
*/
int main() {
    /* The two text blocks being compared. */
    const char *text1 =
        "Machine learning allows computers to learn from data. "
        "It is widely used in modern applications.";

    const char *text2 =
        "Data science uses machine learning techniques. "
        "Applications rely on data-driven models.";

    /* Break each text into its unique lowercase keywords. */
    int size1 = 0;
    int size2 = 0;
    char **words1 = tokenize(text1, &size1);
    char **words2 = tokenize(text2, &size2);

    /* Keywords common to both texts (set intersection). */
    int matchCount = 0;
    char **matches = findMatches(words1, size1, words2, size2, &matchCount);

    /* Report the results. */
    printf("Keywords in Text 1:\n");
    printWords(words1, size1);

    printf("\n\nKeywords in Text 2:\n");
    printWords(words2, size2);

    printf("\n\nMatched Keywords:\n");
    printWords(matches, matchCount);

    printf("\n");

    /* Every list owns its strings, so each is released separately. */
    freeWords(words1, size1);
    freeWords(words2, size2);
    freeWords(matches, matchCount);

    return 0;
}



/*
run:

Keywords in Text 1:
machine learning allows computers to learn from data it is widely used in modern applications 

Keywords in Text 2:
data science uses machine learning techniques applications rely on driven models 

Matched Keywords:
machine learning data applications 

*/

 



answered 1 hour ago by avibootz
...