How to find keyword matching between multiple text blocks in Pascal

1 Answer

0 votes
program KeywordMatching;

{$mode objfpc}{$H+}

type
  TStringSet = array of string;

(*
    Tokenize text into words.
    - Keeps only letters and digits
    - Splits on punctuation and spaces
*)
function tokenize(const text: string): TStringSet;
var
  words: TStringSet;
  size: integer = 0;
  buffer: string = '';
  i: integer;
  c: char;
begin
  SetLength(words, 0);

  for i := 1 to Length(text) do
  begin
    c := text[i];

    if (c in ['A'..'Z','a'..'z','0'..'9']) then
      buffer := buffer + LowerCase(c)
    else if buffer <> '' then
    begin
      SetLength(words, size + 1);
      words[size] := buffer;
      Inc(size);
      buffer := '';
    end;
  end;

  if buffer <> '' then
  begin
    SetLength(words, size + 1);
    words[size] := buffer;
  end;

  Result := words;
end;

(*
    Helper: check if a word exists in a set
*)
function contains(const words: TStringSet; const w: string): boolean;
var
  i: integer;
begin
  Result := False;
  for i := 0 to High(words) do
    if words[i] = w then
      Exit(True);
end;

(*
    // Find keyword matches (set intersection) for TWO texts
    // -------------------------------------------------------------
*)
function findMatches(const words1, words2: TStringSet): TStringSet;
var
  matches: TStringSet;
  size: integer = 0;
  i: integer;
begin
  SetLength(matches, 0);

  for i := 0 to High(words1) do
    if contains(words2, words1[i]) then
    begin
      SetLength(matches, size + 1);
      matches[size] := words1[i];
      Inc(size);
    end;

  Result := matches;
end;

(*
    // Find keyword matches across THREE OR MORE texts
    // -------------------------------------------------------------
    This function receives an array of sets.
    It returns the intersection of ALL sets.
*)
function findMatchesMultiple(const allSets: array of TStringSet): TStringSet;
var
  resultSet, temp: TStringSet;
  size, i, j: integer;
begin
  if Length(allSets) = 0 then Exit(nil);

  (* Start with the first set *)
  resultSet := allSets[0];

  (* Intersect with each remaining set *)
  for i := 1 to High(allSets) do
  begin
    SetLength(temp, 0);
    size := 0;

    for j := 0 to High(resultSet) do
      if contains(allSets[i], resultSet[j]) then
      begin
        SetLength(temp, size + 1);
        temp[size] := resultSet[j];
        Inc(size);
      end;

    resultSet := temp;
  end;

  Result := resultSet;
end;

var
  text1, text2, text3: string;
  words1, words2, words3, matches: TStringSet;
  i: integer;

begin
  // -------------------------------------------------------------
  // Three text blocks to compare
  // -------------------------------------------------------------
  text1 :=
    'Machine learning allows computers to learn from data. ' +
    'It is widely used in modern applications.';

  text2 :=
    'Data science uses machine learning techniques. ' +
    'Applications rely on data-driven models.';

  text3 :=
    'Modern applications of machine learning include data analysis, ' +
    'automation, and intelligent systems.';

  // -------------------------------------------------------------
  // Tokenize all texts
  // -------------------------------------------------------------
  words1 := tokenize(text1);
  words2 := tokenize(text2);
  words3 := tokenize(text3);

  // -------------------------------------------------------------
  // Find keyword matches across ALL texts
  // -------------------------------------------------------------
  matches := findMatchesMultiple([words1, words2, words3]);

  // -------------------------------------------------------------
  // Output results
  // -------------------------------------------------------------
  WriteLn('Matched Keywords Across ALL Texts:');
  for i := 0 to High(matches) do
    Write(matches[i], ' ');
end.



(*
run:

Matched Keywords Across ALL Texts:
machine learning data applications 

*)

 



answered 5 hours ago by avibootz

Related questions

...