#include <cctype>
#include <codecvt> // codecvt_utf8
#include <iostream>
#include <regex>
#include <string>
/// @brief Reduce a UTF-8 (or plain ASCII) string to a canonical lowercase
///        ASCII form suitable for loose text comparison.
///
/// Processing steps, in order:
///   1. Every byte >= 0x80 is discarded. In UTF-8 all bytes of a non-ASCII
///      code point are >= 0x80, so this removes non-ASCII characters whole
///      (e.g. "lãzy" becomes "lzy"). No transliteration is attempted.
///   2. The remaining ASCII bytes are lowercased.
///   3. Every character that is neither a word character (`\w`: letters,
///      digits, underscore) nor whitespace is removed.
///   4. Each run of whitespace is collapsed to a single space. Leading and
///      trailing whitespace is NOT trimmed; such a run becomes one space.
///
/// Malformed UTF-8 does not raise an exception: stray high bytes are simply
/// dropped like any other non-ASCII byte.
///
/// @param text Input text.
/// @return The standardized copy of @p text.
std::string standardize_text(const std::string& text) {
    // Compiling a std::regex is expensive, so build each pattern only once.
    static const std::regex punctuation_pattern(R"([^\w\s])");
    static const std::regex whitespace_run_pattern(R"(\s+)");

    std::string ascii_lower;
    ascii_lower.reserve(text.size());
    for (const char ch : text) {
        // <cctype> functions are undefined for negative arguments, so always
        // go through unsigned char before classifying or converting.
        const auto byte = static_cast<unsigned char>(ch);
        if (byte < 0x80) {
            ascii_lower += static_cast<char>(std::tolower(byte));
        }
    }

    const std::string without_punctuation =
        std::regex_replace(ascii_lower, punctuation_pattern, "");
    return std::regex_replace(without_punctuation, whitespace_run_pattern, " ");
}
// Demo driver: standardizes two sample sentences and prints each result on
// its own line. Both sentences are processed before anything is printed.
int main() {
    constexpr int kSentenceCount = 2;

    const std::string raw_sentences[kSentenceCount] = {
        "the Quick, BROWN Fox Isnt Jumps OVER the lazy dog!!!",
        "The quick; BROWN big Fox Isn't Jumps OVER the lãzy dog!",
    };

    std::string cleaned_sentences[kSentenceCount];
    for (int i = 0; i < kSentenceCount; ++i) {
        cleaned_sentences[i] = standardize_text(raw_sentences[i]);
    }

    for (const std::string& line : cleaned_sentences) {
        std::cout << line << std::endl;
    }
}
/*
run:
the quick brown fox isnt jumps over the lazy dog
the quick brown big fox isnt jumps over the lzy dog
*/