How to get the top N words of a string by number of occurrences in Go

1 Answer

0 votes
package main

import (
    "fmt"
    "regexp"
    "sort"
    "strings"
)

// wordPattern matches maximal runs of ASCII letters, digits, and underscores.
// It is compiled once at package scope so repeated removeWord calls do not
// recompile it.
var wordPattern = regexp.MustCompile(`\w+`)

// removeWord lowercases s, splits it into \w+ tokens, drops every token equal
// to word, and returns the remaining tokens joined by single spaces.
//
// Because the text is normalized first, the result is always lowercase and
// free of punctuation, even when word does not occur in s. word is lowercased
// before the comparison; otherwise a mixed-case argument such as "The" could
// never match the already-lowercased tokens.
func removeWord(s, word string) string {
	tokens := wordPattern.FindAllString(strings.ToLower(s), -1)
	target := strings.ToLower(word)

	// Filter in place: kept shares the backing array of tokens, which is safe
	// because the write position never overtakes the read position.
	kept := tokens[:0]
	for _, tok := range tokens {
		if tok != target {
			kept = append(kept, tok)
		}
	}

	return strings.Join(kept, " ")
}

// getTopNWords returns the n most frequent words in s together with their
// occurrence counts.
//
// The text is lowercased and split into \w+ tokens (runs of ASCII letters,
// digits, and underscores); a fixed list of stop words is ignored. Words are
// ranked by descending count, with ties broken alphabetically, so the
// selection is deterministic. If fewer than n distinct words remain, all of
// them are returned; n <= 0 yields an empty, non-nil map.
//
// The result is a map, so the ranking order itself is not preserved: callers
// that need ordered output must sort the entries themselves.
func getTopNWords(s string, n int) map[string]int {
	// Stop words (commonly used words) that are excluded from the count.
	stopWords := map[string]struct{}{
		"is": {}, "a": {}, "to": {}, "as": {}, "for": {}, "of": {}, "at": {},
		"it": {}, "by": {}, "on": {}, "and": {}, "the": {}, "alongside": {},
		"also": {},
	}

	re := regexp.MustCompile(`\w+`)

	// Tokenize once and skip stop words while counting, instead of
	// re-tokenizing and re-joining the whole text once per stop word.
	wordCount := make(map[string]int)
	for _, word := range re.FindAllString(strings.ToLower(s), -1) {
		if _, isStop := stopWords[word]; isStop {
			continue
		}
		wordCount[word]++
	}

	type kv struct {
		Key   string
		Value int
	}

	// Flatten the counts so they can be ranked.
	sortedWordCount := make([]kv, 0, len(wordCount))
	for k, v := range wordCount {
		sortedWordCount = append(sortedWordCount, kv{k, v})
	}

	// Highest count first; equal counts fall back to alphabetical order so
	// the cut-off at n does not depend on map iteration order.
	sort.Slice(sortedWordCount, func(i, j int) bool {
		if sortedWordCount[i].Value == sortedWordCount[j].Value {
			return sortedWordCount[i].Key < sortedWordCount[j].Key
		}
		return sortedWordCount[i].Value > sortedWordCount[j].Value
	})

	// Clamp n to the valid range [0, number of distinct words].
	limit := n
	if limit > len(sortedWordCount) {
		limit = len(sortedWordCount)
	}
	if limit < 0 {
		limit = 0
	}

	topNWords := make(map[string]int, limit)
	for _, entry := range sortedWordCount[:limit] {
		topNWords[entry.Key] = entry.Value
	}

	return topNWords
}

// main prints the five most frequent non-stop words of a sample text, one per
// line, most frequent first.
func main() {
	s := "Go is a statically typed, compiled high-level " +
		"general purpose programming language. It is known for " +
		"the simplicity of its syntax and the efficiency of development " +
		"that it enables by the inclusion of a large standard library " +
		"supplying many needs for common projects. Go was designed at " +
		"Google in 2009 by Robert Griesemer, Rob Pike, and Ken Thompson."

	n := 5

	topNWords := getTopNWords(s, n)

	// Go randomizes map iteration order, so ranging over topNWords directly
	// would print the words in a different order on each run. Rank the keys
	// explicitly: highest count first, ties alphabetically.
	words := make([]string, 0, len(topNWords))
	for word := range topNWords {
		words = append(words, word)
	}
	sort.Slice(words, func(i, j int) bool {
		ci, cj := topNWords[words[i]], topNWords[words[j]]
		if ci != cj {
			return ci > cj
		}
		return words[i] < words[j]
	})

	for _, word := range words {
		fmt.Println(word)
	}
}



/*
run:

go
2009
common
compiled
designed

*/

 



answered Feb 3 by avibootz
...