How to implement basic text standardization in Python

1 Answer

0 votes
import re
import unicodedata

def standardize_text(text: str) -> str:
    """Return *text* reduced to a lowercase, ASCII-only, punctuation-free form.

    The steps are applied in this order:

    1. NFKD-decompose the text and drop every character that cannot be
       encoded as ASCII (accents are stripped, e.g. "ã" becomes "a").
    2. Lowercase the result.
    3. Remove punctuation, underscores included.
    4. Collapse each run of whitespace into one space and trim both ends.

    Args:
        text: The string to standardize.

    Returns:
        The standardized string. It is empty when nothing survives the
        filtering (for example an empty or whitespace-only input).
    """
    # Transliterate first. The bytes produced are pure ASCII, so they are
    # decoded with the same codec they were encoded with.
    ascii_text = (
        unicodedata.normalize('NFKD', text)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )

    # Lowercase only after normalization: NFKD can introduce uppercase ASCII
    # letters (e.g. "™" decomposes to "TM"), which an earlier lower() misses.
    ascii_text = ascii_text.lower()

    # \w counts "_" as a word character, so it has to be removed explicitly.
    without_punctuation = re.sub(r'[^\w\s]|_', '', ascii_text)

    # Collapse whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', without_punctuation).strip()

# Two near-identical sentences that differ in case, punctuation and accents.
sentence1 = "the Quick, BROWN Fox Isnt Jumps OVER the lazy dog!!!"
sentence2 = "The quick; BROWN big Fox Isn't Jumps OVER the lãzy dog!"

# Standardize both sentences, keeping each result under its own name.
std_sentence1, std_sentence2 = map(standardize_text, (sentence1, sentence2))

# Print one standardized sentence per line.
print(std_sentence1, std_sentence2, sep="\n")



'''
run:

the quick brown fox isnt jumps over the lazy dog
the quick brown big fox isnt jumps over the lazy dog

'''

 



answered Nov 24, 2024 by avibootz

Related questions

1 answer 87 views
2 answers 165 views
1 answer 206 views
206 views asked Nov 16, 2018 by avibootz
1 answer 137 views
137 views asked Dec 26, 2020 by avibootz
...