require 'set'
#
# Tokenize text into a Set of unique, lowercased words.
# - Keeps only letters and digits
# - Splits on every other character (punctuation, spaces, ...)
#
# @param text [String] the text to split into words
# @return [Set<String>] the distinct lowercased words, in order of first
#   appearance; empty when the text contains no letters or digits
def tokenize(text)
  # [[:alnum:]] is Unicode-aware for UTF-8 strings, so accented and
  # non-Latin letters stay inside a word instead of splitting it
  # (an ASCII-only class would turn "café" into "caf").
  # Downcasing happens after the scan so that case mapping can never
  # change where the word boundaries fall.
  Set.new(text.scan(/[[:alnum:]]+/).map(&:downcase))
end
#
# Find keyword matches across any number of texts
# -------------------------------------------------------------
# This method receives an array of sets.
# It returns the intersection of ALL sets.
#
# @param all_sets [Array<Set>] the sets to intersect
# @return [Set] the elements present in every set, in the iteration order
#   of the first set; an empty Set when +all_sets+ is empty
def find_matches_multiple(all_sets)
  return Set.new if all_sets.empty?

  # Seed the fold with a copy of the first set so the caller's set is
  # never handed back or modified.
  seed = all_sets.first.dup

  # Narrow the running intersection against each remaining set in turn.
  all_sets.drop(1).reduce(seed) do |common, other|
    Set.new(common.select { |word| other.include?(word) })
  end
end
#
# -------------------------------------------------------------
# Three text blocks to compare
# -------------------------------------------------------------
#
text1 =
  "Machine learning allows computers to learn from data. " \
  "It is widely used in modern applications."
text2 =
  "Data science uses machine learning techniques. " \
  "Applications rely on data-driven models."
text3 =
  "Modern applications of machine learning include data analysis, " \
  "automation, and intelligent systems."
#
# -------------------------------------------------------------
# Tokenize all texts
# -------------------------------------------------------------
#
words1 = tokenize(text1)
words2 = tokenize(text2)
words3 = tokenize(text3)
# Collect the word sets in an array for multi-text comparison
all_sets = [words1, words2, words3]
#
# -------------------------------------------------------------
# Find keyword matches across ALL texts
# -------------------------------------------------------------
#
matches = find_matches_multiple(all_sets)
#
# -------------------------------------------------------------
# Output results
# -------------------------------------------------------------
#
puts "Matched Keywords Across ALL Texts:"
# A Ruby Set iterates in insertion order, not sorted order, so sort
# explicitly to get stable alphabetical output. Joining the words also
# avoids a trailing space, and puts ends the line with a newline.
puts matches.sort.join(" ")
#
# run:
#
# Matched Keywords Across ALL Texts:
# applications data learning machine
#