How to get the top N most frequent words in a string in PHP

1 Answer

0 votes
/**
 * Removes every occurrence of a word from a string, ignoring letter case.
 *
 * The input is lower-cased and split into runs of word characters (\w+), so
 * the result is always lower-cased, has punctuation dropped, and has its
 * remaining words separated by single spaces.
 *
 * @param string $s    Text to filter.
 * @param string $word Word to remove; compared case-insensitively.
 *
 * @return string The remaining words joined by single spaces ('' if none remain).
 */
function remove_word($s, $word) {
    preg_match_all('/\w+/', strtolower($s), $matches);

    // The tokens were lower-cased above, so the needle must be normalised the
    // same way; otherwise a needle such as "The" could never match anything.
    $needle = strtolower($word);

    $kept = array_filter($matches[0], function ($token) use ($needle) {
        return $token !== $needle;
    });

    return implode(' ', $kept);
}

/**
 * Returns the N most frequent words of a text, ignoring common stop words.
 *
 * The text is lower-cased and split into runs of word characters (\w+), so
 * punctuation and hyphens act as separators ("general-purpose" counts as
 * "general" and "purpose").
 *
 * @param string   $s Text to analyse.
 * @param int|null $n Maximum number of words to return. Zero or a negative
 *                    value yields an empty array; null returns every word.
 *
 * @return array Map of word => occurrence count, most frequent first. Ties keep
 *               first-occurrence order on PHP 8.0+, where sorting is stable.
 */
function get_top_n_words($s, $n) {
    // A negative length would make array_slice() drop entries from the END of
    // the ranking instead of returning nothing, so reject it up front.
    if ($n !== null && $n <= 0) {
        return [];
    }

    // Stop words (commonly used words) stored as keys for O(1) lookups
    $stop_words = array_flip(["is", "a", "to", "as", "now", "by", "on", "and", "the", "it", "was"]);

    // Tokenise once, then drop the stop words in a single pass rather than
    // re-splitting and re-joining the whole text once per stop word.
    preg_match_all('/\w+/', strtolower($s), $matches);
    $words = array_filter($matches[0], function ($token) use ($stop_words) {
        return !isset($stop_words[$token]);
    });

    // Count the occurrences of each remaining word
    $word_count = array_count_values($words);

    // Highest counts first; keys (the words) are preserved
    arsort($word_count);

    return array_slice($word_count, 0, $n, true);
}

// Sample text to analyse (the line breaks and indentation are part of the string)
$s = "PHP is a general-purpose scripting language geared towards 
      web development. It was originally created by Danish-Canadian programmer 
      Rasmus Lerdorf. The PHP reference implementation is now produced by the 
      PHP Group. PHP was originally an abbreviation of Personal Home Page,
      but it now stands for the recursive acronym PHP: Hypertext Preprocessor.";

// How many of the most frequent words to report
$n = 5;

$top_n_words = get_top_n_words($s, $n);

// Only the words themselves are printed, one per line; the counts are not shown
foreach (array_keys($top_n_words) as $word) {
    echo $word, "\n";
}


/*
run:
 
php
originally
general
purpose
scripting
 
*/

 



answered Feb 2 by avibootz
...