How to find matching keywords between two text blocks in Pascal

1 Answer

0 votes
program KeywordMatching;

{$mode objfpc}{$H+}

type
  TStringListDyn = array of string;


(*
    Membership test used to give a plain dynamic array set-like behaviour.

    Scans `list` from first to last entry and reports whether any entry
    equals `word` (plain string '=' comparison, so the match is
    case-sensitive). An empty list yields False.
*)
function contains(const list: TStringListDyn; const word: string): boolean;
var
  entry: string;
begin
  for entry in list do
    if entry = word then
    begin
      (* First hit is enough; stop scanning. *)
      Result := True;
      Exit;
    end;

  Result := False;
end;

(*
    Append `word` to `list` unless an equal entry is already stored.

    `size` is the element count maintained by the caller: the array is
    resized to size + 1, the word is stored at index `size`, and `size`
    is then incremented. When the word is already present, neither
    `list` nor `size` is modified.
*)
procedure addWord(var list: TStringListDyn; var size: integer; const word: string);
var
  slot: integer;
begin
  if not contains(list, word) then
  begin
    slot := size;               (* index the new entry will occupy *)
    SetLength(list, slot + 1);
    list[slot] := word;
    size := slot + 1;
  end;
end;

(*
    Split `text` into lowercase words.

    - A word is a maximal run of ASCII letters and digits
      ('A'..'Z', 'a'..'z', '0'..'9'); every other character acts as a
      separator and is discarded.
    - Each character is lowercased as it is collected.
    - Words are stored through addWord, so a word that occurs several
      times in the text appears only once in the result, at the position
      of its first occurrence.

    Returns an empty array when `text` contains no letters or digits.
*)
function tokenize(const text: string): TStringListDyn;
var
  collected: TStringListDyn;
  used: integer;
  current: string;
  k: integer;
  ch: char;

  (* Store the word gathered so far (if any) and start a new one. *)
  procedure flushCurrent;
  begin
    if current = '' then Exit;
    addWord(collected, used, current);
    current := '';
  end;

begin
  collected := nil;
  used := 0;
  current := '';

  for k := 1 to Length(text) do
  begin
    ch := text[k];

    (* Explicit character ranges stand in for an IsLetterOrDigit helper. *)
    case ch of
      'A'..'Z', 'a'..'z', '0'..'9':
        current := current + LowerCase(ch);
    else
      flushCurrent;
    end;
  end;

  (* The text may end in the middle of a word. *)
  flushCurrent;

  Result := collected;
end;


(*
    Find keyword matches (set intersection).

    Walks `words1` in order and keeps every word that also occurs in
    `words2`. The result therefore follows the ordering of `words1`.
    Entries are stored through addWord, so a word repeated in `words1`
    is reported only once.
*)
function findMatches(const words1, words2: TStringListDyn): TStringListDyn;
var
  common: TStringListDyn;
  found: integer;
  candidate: string;
begin
  common := nil;
  found := 0;

  for candidate in words1 do
  begin
    if not contains(words2, candidate) then
      Continue;
    addWord(common, found, candidate);
  end;

  Result := common;
end;

(*
    Write every entry of `list` to standard output, each followed by a
    single space, without a trailing line break.
*)
procedure printWords(const list: TStringListDyn);
var
  entry: string;
begin
  for entry in list do
    Write(entry, ' ');
end;

var
  text1, text2: string;
  words1, words2, matches: TStringListDyn;

begin
  (* The two text blocks to compare. *)
  text1 :=
    'Machine learning allows computers to learn from data. ' +
    'It is widely used in modern applications.';

  text2 :=
    'Data science uses machine learning techniques. ' +
    'Applications rely on data-driven models.';

  (* Reduce each text to its list of distinct lowercase words. *)
  words1 := tokenize(text1);
  words2 := tokenize(text2);

  (* Words present in both texts, in the order they occur in text1. *)
  matches := findMatches(words1, words2);

  (* Report the three lists; #10#10 finishes the previous word line and
     leaves one blank line before the next heading. *)
  WriteLn('Keywords in Text 1:');
  printWords(words1);

  WriteLn(#10#10'Keywords in Text 2:');
  printWords(words2);

  WriteLn(#10#10'Matched Keywords:');
  printWords(matches);

  WriteLn;
end.




(*
run:

Keywords in Text 1:
machine learning allows computers to learn from data it is widely used in modern applications 

Keywords in Text 2:
data science uses machine learning techniques applications rely on driven models 

Matched Keywords:
machine learning data applications 

*)

 



answered 10 hours ago by avibootz
edited 9 hours ago by avibootz

Related questions

...