#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/*
Lowercase a NUL-terminated string in place.

s: writable string to convert; must not be NULL.

Every byte is handed to tolower() as an unsigned char, so the mapping
follows the current C locale.
*/
void toLower(char *s) {
    char *cursor = s;
    while (*cursor != '\0') {
        *cursor = (char)tolower((unsigned char)*cursor);
        cursor++;
    }
}
/*
Linear membership test over an array of strings (stands in for a
std::set lookup).

list: array holding at least `size` NUL-terminated strings; it is not
      read at all when size <= 0.
size: number of entries to examine.
word: string to look for, compared with strcmp().

Returns 1 if some entry equals word, 0 otherwise.
*/
int contains(char **list, int size, const char *word) {
    int found = 0;
    /* Stop scanning as soon as a match has been seen. */
    for (int idx = 0; !found && idx < size; idx++) {
        found = (strcmp(list[idx], word) == 0);
    }
    return found;
}
/*
Append a copy of word to a dynamically grown list of unique strings.

list: address of the array pointer; *list may be NULL while *size is 0.
      It is updated in place when the array is reallocated.
size: address of the element count; incremented when a word is added.
word: NUL-terminated string to add. It is copied, so the caller keeps
      ownership of the original.

Does nothing if word is already present. If memory cannot be obtained,
a message is written to stderr and the list is left valid with its
count unchanged, so every existing entry can still be used and freed.
The caller frees each stored string and then the array itself.
*/
void addWord(char ***list, int *size, const char *word) {
    if (contains(*list, *size, word)) {
        return;
    }

    /* Duplicate the word first so a failure here leaves the array untouched. */
    size_t bytes = strlen(word) + 1;
    char *copy = malloc(bytes);
    if (copy == NULL) {
        fprintf(stderr, "addWord: out of memory\n");
        return;
    }
    memcpy(copy, word, bytes);

    /* Grow through a temporary: assigning realloc()'s result straight to
       *list would lose the old array (and every word in it) on failure. */
    char **grown = realloc(*list, ((size_t)*size + 1) * sizeof *grown);
    if (grown == NULL) {
        free(copy);
        fprintf(stderr, "addWord: out of memory\n");
        return;
    }

    grown[*size] = copy;
    *list = grown;
    (*size)++;
}
/*
Split text into a list of unique, lowercased words.

A word is a maximal run of alphanumeric characters (as judged by
isalnum()); every other character acts as a separator and is discarded.
Words are lowercased and stored in order of first appearance; repeats
are dropped by addWord().

text:    NUL-terminated input; must not be NULL.
outSize: receives the number of words in the returned list.

Returns a heap-allocated array of heap-allocated strings, or NULL when
the text contains no words or the scratch buffer cannot be allocated
(*outSize is 0 in both cases). The caller frees each string and then
the array.
*/
char **tokenize(const char *text, int *outSize) {
    char **words = NULL;
    *outSize = 0;

    /* No word can be longer than the text itself, so a scratch buffer of
       strlen(text) + 1 bytes can never overflow. A fixed-size buffer
       would be overrun by any sufficiently long run of letters/digits. */
    char *buffer = malloc(strlen(text) + 1);
    if (buffer == NULL) {
        return NULL;
    }

    size_t bi = 0;
    for (size_t i = 0; text[i] != '\0'; i++) {
        /* <ctype.h> functions require a value representable as unsigned
           char; a negative plain char would be undefined behavior. */
        unsigned char c = (unsigned char)text[i];
        if (isalnum(c)) {
            buffer[bi++] = (char)tolower(c);
        } else if (bi > 0) {
            /* A separator ends the word collected so far. */
            buffer[bi] = '\0';
            addWord(&words, outSize, buffer);
            bi = 0;
        }
    }

    /* Flush a word that runs up to the end of the text. */
    if (bi > 0) {
        buffer[bi] = '\0';
        addWord(&words, outSize, buffer);
    }

    free(buffer);
    return words;
}
/*
Find keyword matches (set intersection).

words1, size1: first word list and its length.
words2, size2: second word list and its length.
matchCount:    receives the number of words in the result.

Returns a newly built list holding a copy of every word of words1 that
also occurs in words2, in words1 order and without duplicates. The
result is NULL when nothing matches. The caller frees each string and
then the array.
*/
char **findMatches(char **words1, int size1, char **words2, int size2, int *matchCount) {
    char **common = NULL;
    int idx = 0;

    *matchCount = 0;
    while (idx < size1) {
        const char *candidate = words1[idx++];
        if (!contains(words2, size2, candidate)) {
            continue;
        }
        addWord(&common, matchCount, candidate);
    }
    return common;
}
/* Print every word in the list, each followed by a single space. */
static void printWordList(char **words, int count) {
    int idx = 0;
    while (idx < count) {
        printf("%s ", words[idx]);
        idx++;
    }
}

/* Release every string in the list and then the list array itself. */
static void freeWordList(char **words, int count) {
    int idx = 0;
    while (idx < count) {
        free(words[idx]);
        idx++;
    }
    free(words);
}

/*
Demo driver: tokenizes two hard-coded text blocks, computes the words
they share, prints all three word lists, and releases the memory.
*/
int main() {
    /* The two text blocks to compare. */
    const char *text1 =
        "Machine learning allows computers to learn from data. "
        "It is widely used in modern applications.";
    const char *text2 =
        "Data science uses machine learning techniques. "
        "Applications rely on data-driven models.";

    /* Break each text into its list of unique lowercase words. */
    int size1 = 0;
    int size2 = 0;
    char **words1 = tokenize(text1, &size1);
    char **words2 = tokenize(text2, &size2);

    /* Words present in both lists. */
    int matchCount = 0;
    char **matches = findMatches(words1, size1, words2, size2, &matchCount);

    /* Report the results. */
    printf("Keywords in Text 1:\n");
    printWordList(words1, size1);
    printf("\n\nKeywords in Text 2:\n");
    printWordList(words2, size2);
    printf("\n\nMatched Keywords:\n");
    printWordList(matches, matchCount);
    printf("\n");

    /* Every list owns its strings, so each one is released in full. */
    freeWordList(words1, size1);
    freeWordList(words2, size2);
    freeWordList(matches, matchCount);
    return 0;
}
/*
Sample run (program output):

Keywords in Text 1:
machine learning allows computers to learn from data it is widely used in modern applications

Keywords in Text 2:
data science uses machine learning techniques applications rely on driven models

Matched Keywords:
machine learning data applications
*/