How to find the longest shared prefix among all words in a string with C

1 Answer

0 votes
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h> // tolower // isalpha

#define MAX_WORDS 1024
#define MAX_PREFIXES 8192

// -----------------------------
// Data structures
// -----------------------------

typedef struct {
    char *prefix;
    char **words;
    size_t count;
} PrefixGroup;

// -----------------------------
// Utility: lowercase a string
// -----------------------------
void to_lower(char *s) {
    for (; *s; ++s)
        *s = (char)tolower(*s);
}

// -----------------------------
// Extract alphabetic words
// -----------------------------
size_t extract_words(const char *text, char **words) {
    size_t count = 0;
    const char *p = text;

    while (*p) {
        // Skip non-alpha
        while (*p && !isalpha((unsigned char)*p))
            p++;

        if (!*p) break;

        const char *start = p;

        // Consume alpha
        while (*p && isalpha((unsigned char)*p))
            p++;

        size_t len = p - start;
        char *w = malloc(len + 1);
        memcpy(w, start, len);
        w[len] = '\0';
        to_lower(w);

        words[count++] = w;
    }

    return count;
}

// -----------------------------
// Add a word to a prefix group
// -----------------------------
void add_word_to_group(PrefixGroup *g, const char *word) {
    g->words = realloc(g->words, (g->count + 1) * sizeof(char *));
    g->words[g->count++] = strdup(word);
}

// -----------------------------
// Find or create prefix group
// -----------------------------
PrefixGroup *find_or_create_group(PrefixGroup *groups, size_t *group_count, const char *prefix) {
    for (size_t i = 0; i < *group_count; i++) {
        if (strcmp(groups[i].prefix, prefix) == 0)
            return &groups[i];
    }

    // Create new group
    groups[*group_count].prefix = strdup(prefix);
    groups[*group_count].words = NULL;
    groups[*group_count].count = 0;

    return &groups[(*group_count)++];
}

// -----------------------------
// Group words by all prefixes
// -----------------------------
size_t group_by_all_prefixes(char **words, size_t word_count, PrefixGroup *groups) {
    size_t group_count = 0;

    for (size_t w = 0; w < word_count; w++) {
        size_t len = strlen(words[w]);

        for (size_t i = 1; i <= len; i++) {
            char prefix[128];
            memcpy(prefix, words[w], i);
            prefix[i] = '\0';

            PrefixGroup *g = find_or_create_group(groups, &group_count, prefix);
            add_word_to_group(g, words[w]);
        }
    }

    return group_count;
}

// -----------------------------
// Find longest prefix with >=2 words
// -----------------------------
PrefixGroup *find_longest_shared_prefix(PrefixGroup *groups, size_t group_count) {
    PrefixGroup *best = NULL;

    for (size_t i = 0; i < group_count; i++) {
        if (groups[i].count >= 2) {
            if (!best || strlen(groups[i].prefix) > strlen(best->prefix))
                best = &groups[i];
        }
    }

    return best;
}

// -----------------------------
// Main
// -----------------------------
int main() {
    const char *text =
        "The Lowly inhabitants of the lowland were surprised to see "
        "the lower branches of the trees.";

    char *words[MAX_WORDS];
    PrefixGroup groups[MAX_PREFIXES];

    size_t word_count = extract_words(text, words);
    size_t group_count = group_by_all_prefixes(words, word_count, groups);

    PrefixGroup *best = find_longest_shared_prefix(groups, group_count);

    if (best) {
        printf("Longest shared prefix:\n");
        printf("%s | prefix_len=%zu | group_count=%zu | [",
               best->prefix, strlen(best->prefix), best->count);

        for (size_t i = 0; i < best->count; i++) {
            printf("%s", best->words[i]);
            if (i + 1 < best->count) printf(", ");
        }
        printf("]\n");
    }

    return 0;
}




/*
run:

Longest shared prefix:
lowl | prefix_len=4 | group_count=2 | [lowly, lowland]

*/

 



answered Mar 13 by avibootz
...