How to find matching keywords across multiple text blocks in Scala

1 Answer

0 votes
import scala.collection.mutable

/**
 * Splits `text` into its set of distinct, lower-cased words.
 *
 * A word is a maximal run of characters for which `Character.isLetterOrDigit`
 * holds; every other character (spaces, punctuation, symbols) acts as a
 * separator and is discarded. The text is scanned by Unicode code point rather
 * than by UTF-16 `Char`, so letters outside the Basic Multilingual Plane
 * (stored as surrogate pairs) stay inside their word instead of splitting it.
 *
 * @param text the raw text to tokenize; may be empty
 * @return the distinct lower-cased words found in `text`, or an empty set when
 *         `text` contains no letters or digits
 */
def tokenize(text: String): Set[String] = {
  // Lower-case every letter/digit code point and turn everything else into a
  // space. A space is never a letter or digit, so it is a safe separator.
  val normalized = text.codePoints().toArray().map { codePoint =>
    if (Character.isLetterOrDigit(codePoint)) Character.toLowerCase(codePoint)
    else ' '.toInt
  }

  // Consecutive separators produce empty fragments, which are dropped here.
  new String(normalized, 0, normalized.length)
    .split(' ')
    .iterator
    .filter(_.nonEmpty)
    .toSet
}

/**
 * Returns the keywords that occur in every one of the given word sets.
 *
 * Works for any number of sets: an empty vector yields the empty set, a
 * single set is returned as-is, and two or more sets are intersected
 * pairwise from left to right.
 *
 * @param allSets the word sets to compare, one per text
 * @return the intersection of all sets in `allSets`
 */
def findMatchesMultiple(allSets: Vector[Set[String]]): Set[String] =
  // reduceOption yields None for an empty vector, so no partial `head` or
  // early `return` is needed to guard that case.
  allSets.reduceOption(_ intersect _).getOrElse(Set.empty[String])

@main def main(): Unit = {
  // Three sample passages whose shared vocabulary is reported below.
  val passages = Vector(
    "Machine learning allows computers to learn from data. " +
      "It is widely used in modern applications.",
    "Data science uses machine learning techniques. " +
      "Applications rely on data-driven models.",
    "Modern applications of machine learning include data analysis, " +
      "automation, and intelligent systems."
  )

  // Reduce each passage to its set of distinct words.
  val wordSets = passages.map(tokenize)

  // Keep only the words that appear in every passage.
  val common = findMatchesMultiple(wordSets)

  println("Matched Keywords Across ALL Texts:")
  // Each keyword is followed by one space; no trailing newline is printed.
  for (keyword <- common) print(s"$keyword ")
}



/*
run (a Set has no guaranteed iteration order, so the keywords may print in a different order):

Matched Keywords Across ALL Texts:
data machine learning applications 

*/

 



answered 4 hours ago by avibootz

Related questions

...