How to get the top N most frequent words of a string in Scala

1 Answer

0 votes
import scala.collection.mutable
import scala.util.matching.Regex

/**
 * Lower-cases `s`, splits it into `\w+` tokens and drops every token equal to `word`.
 *
 * The comparison is an exact string match against the lower-cased tokens, so a `word`
 * containing upper-case characters never matches anything.
 *
 * @param s    text to filter
 * @param word token to drop
 * @return the remaining tokens joined by single spaces (empty string if none remain)
 */
def removeWord(s: String, word: String): String =
  "\\w+".r
    .findAllIn(s.toLowerCase)
    .filterNot(_ == word)
    .mkString(" ")

/**
 * Returns the `n` most frequent words of `s`, ignoring case and a fixed set of stop words.
 *
 * Words are the `\w+` matches of the lower-cased input. They are ranked by descending
 * number of occurrences; words with equal counts are ordered alphabetically.
 *
 * The result is a `ListMap`, so iterating it (`keys`, `foreach`, `toSeq`, ...) yields the
 * entries in ranking order for any `n`. A generic immutable `Map` built with `toMap` does
 * not guarantee iteration order once it grows past a handful of entries.
 *
 * @param s text to analyse
 * @param n maximum number of entries to return; `n <= 0` yields an empty map
 * @return at most `n` (word -> occurrences) pairs, iterated from most to least frequent
 */
def getTopNWords(s: String, n: Int): Map[String, Int] = {
  import scala.collection.immutable.ListMap

  // Exclude stop words (commonly used words)
  val stopWords: Set[String] = Set("is", "a", "to", "as", "for", "of",
                  "on", "and", "the", "alongside", "also", "are", "be")

  // Tokenise once and drop stop words in the same pass (a Set is its own predicate).
  val words: Vector[String] =
    "\\w+".r.findAllIn(s.toLowerCase).filterNot(stopWords).toVector

  // Count the occurrences of each word
  val wordCount: Map[String, Int] =
    words.groupBy(identity).map { case (word, hits) => word -> hits.size }

  // Highest count first; negating the count lets one ascending sort handle both keys.
  val ranked: Seq[(String, Int)] = wordCount.toSeq
    .sortBy { case (word, count) => (-count, word) }
    .take(n)

  // ListMap keeps insertion order, so the ranking survives in the returned Map.
  ListMap(ranked: _*)
}

/** Demo entry point: prints the keys returned by `getTopNWords` for a sample sentence, one per line. */
object Main extends App {
  // Sample text, assembled from its four fragments.
  val s = List(
    "Scala is a strong statically typed high-level general-purpose ",
    "programming language that supports both object-oriented programming ",
    "and functional programming. Designed to be concise, many of Scala ",
    "design decisions are intended to address criticisms of Java."
  ).mkString

  // Number of words requested.
  val n = 4

  val topNWords = getTopNWords(s, n)

  for (word <- topNWords.keys) println(word)
}

   
   
/*
run:
     
programming
scala
address
both
 
*/

 



answered Feb 3 by avibootz
edited Feb 3 by avibootz
...