How to get the N top words of a string by occurrences in Rust

1 Answer

0 votes
use regex::Regex;
use std::collections::HashMap;

fn get_top_n_words(s: &str, n: usize) -> Vec<(String, i32)> {
    let re = Regex::new(r"\w+").unwrap();
    // Split the string into words
    let words: Vec<String> = re.find_iter(&s.to_lowercase()) // Directly iterate over the lowercase String
            .map(|mat| mat.as_str().to_string()) // Convert &str to String
            .collect();

    let mut word_count: HashMap<String, i32> = HashMap::new();
    for word in words {
        // Exclude stop words (commonly used words)
        if word == "a" || word == "and" || word == "it" || word == "the" {
            continue;
        }
        *word_count.entry(word).or_insert(0) += 1;
    }

    let mut word_vec: Vec<(String, i32)> = word_count.into_iter().collect();
    word_vec.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));

    word_vec.into_iter().take(n).collect()
}

/// Demo driver: prints the four most frequent words of a sample paragraph,
/// one `word: count` pair per line.
fn main() {
    let text = "Rust is a general-purpose programming language \
             emphasizing performance, type safety, and concurrency. \
             It enforces memory safety, meaning that all references \
             point to valid memory. Rust does so without a traditional \
             garbage collector; instead, memory safety errors \
             and data races are prevented by the borrow checker, \
             which tracks the object lifetime of references at compile time.";

    // Number of top-ranked words to report.
    let limit = 4;

    get_top_n_words(text, limit)
        .iter()
        .for_each(|(word, count)| println!("{}: {}", word, count));
}

   
/*
run:

memory: 3
safety: 3
references: 2
rust: 2
  
*/

 



answered Feb 3 by avibootz
...