How to find matching keywords across multiple text blocks in VB.NET

1 Answer

0 votes
Imports System
Imports System.Collections.Generic

Module KeywordMatching

    ''' <summary>
    ''' Splits text into the set of distinct, lower-cased words it contains.
    ''' </summary>
    ''' <remarks>
    ''' A word is a maximal run of letters and digits. Every other character
    ''' (spaces, punctuation, hyphens, ...) acts as a separator and is dropped,
    ''' so "data-driven" yields the two words "data" and "driven".
    ''' </remarks>
    ''' <param name="text">The text to tokenize. Nothing is treated as empty text.</param>
    ''' <returns>The distinct words of <paramref name="text"/>; an empty set if there are none.</returns>
    Function Tokenize(text As String) As HashSet(Of String)
        Dim words As New HashSet(Of String)()

        If String.IsNullOrEmpty(text) Then
            Return words
        End If

        ' A StringBuilder avoids allocating a new string for every appended character.
        Dim current As New System.Text.StringBuilder()

        For Each c As Char In text
            If Char.IsLetterOrDigit(c) Then
                ' Invariant lower-casing keeps the tokens independent of the
                ' culture the program happens to run under.
                current.Append(Char.ToLowerInvariant(c))
            ElseIf current.Length > 0 Then
                ' A separator ends the word collected so far.
                words.Add(current.ToString())
                current.Clear()
            End If
        Next

        ' Flush the last word when the text does not end with a separator.
        If current.Length > 0 Then
            words.Add(current.ToString())
        End If

        Return words
    End Function

    ''' <summary>
    ''' Returns the words that occur in every one of the given sets
    ''' (the intersection of all sets).
    ''' </summary>
    ''' <remarks>
    ''' Works for any number of sets: an empty list gives an empty result and a
    ''' single set gives a copy of that set. The input sets are never modified.
    ''' </remarks>
    ''' <param name="allSets">
    ''' The word sets to intersect. Nothing is treated as an empty list, and a
    ''' Nothing entry is treated as an empty set.
    ''' </param>
    ''' <returns>A new set holding the words common to all sets.</returns>
    Function FindMatchesMultiple(allSets As List(Of HashSet(Of String))) As HashSet(Of String)
        If allSets Is Nothing OrElse allSets.Count = 0 OrElse allSets(0) Is Nothing Then
            Return New HashSet(Of String)()
        End If

        ' Start from a copy of the first set so the caller's data stays untouched.
        Dim result As New HashSet(Of String)(allSets(0))

        For i As Integer = 1 To allSets.Count - 1
            ' Nothing left to match: later sets cannot add words back.
            If result.Count = 0 Then
                Exit For
            End If

            ' Copy to a local so the lambda below does not capture the loop variable.
            Dim other As HashSet(Of String) = allSets(i)

            If other Is Nothing Then
                result.Clear()
                Exit For
            End If

            ' Drop, in place, every word the current set does not contain.
            result.RemoveWhere(Function(w) Not other.Contains(w))
        Next

        Return result
    End Function

    ''' <summary>
    ''' Demo entry point: tokenizes three sample texts and prints the keywords
    ''' shared by all of them, in alphabetical order.
    ''' </summary>
    Sub Main()

        ' -------------------------------------------------------------
        ' Three text blocks to compare
        ' -------------------------------------------------------------
        Dim text1 As String =
            "Machine learning allows computers to learn from data. " &
            "It is widely used in modern applications."

        Dim text2 As String =
            "Data science uses machine learning techniques. " &
            "Applications rely on data-driven models."

        Dim text3 As String =
            "Modern applications of machine learning include data analysis, " &
            "automation, and intelligent systems."

        ' -------------------------------------------------------------
        ' Tokenize all texts
        ' -------------------------------------------------------------
        Dim words1 = Tokenize(text1)
        Dim words2 = Tokenize(text2)
        Dim words3 = Tokenize(text3)

        ' Put them into a list for multi-text comparison
        Dim allSets As New List(Of HashSet(Of String)) From {words1, words2, words3}

        ' -------------------------------------------------------------
        ' Find keyword matches across ALL texts
        ' -------------------------------------------------------------
        Dim matches = FindMatchesMultiple(allSets)

        ' A HashSet has no defined enumeration order, so sort the matches to
        ' get a stable, alphabetical listing.
        Dim sortedMatches As New List(Of String)(matches)
        sortedMatches.Sort(StringComparer.Ordinal)

        ' -------------------------------------------------------------
        ' Output results
        ' -------------------------------------------------------------
        Console.WriteLine("Matched Keywords Across ALL Texts:")
        For Each w As String In sortedMatches
            Console.Write(w & " ")
        Next

        Console.WriteLine()
    End Sub

End Module


'
' run:
'
' Matched Keywords Across ALL Texts:
' applications data learning machine 
'

 



answered 9 hours ago by avibootz

Related questions

...