How to implement basic text standardization in Python

1 Answer

import re
import unicodedata

def standardize_text(text: str) -> str:
    """Return *text* as lowercase ASCII words separated by single spaces.

    The steps run in this order:

    1. NFKD-normalize, so accented letters split into a base letter plus
       combining marks and compatibility symbols expand to plain letters.
    2. Case-fold the result.
    3. Drop every character that is still not ASCII.
    4. Delete punctuation (anything that is neither a word character nor
       whitespace). Punctuation is deleted, not replaced by a space, so
       "isn't" becomes "isnt". Underscores count as word characters and
       are kept.
    5. Collapse whitespace runs to one space and trim both ends.

    Args:
        text: The string to standardize.

    Returns:
        The standardized string; empty if nothing survives the filtering.
    """
    # Normalize BEFORE folding case: symbols such as the trademark sign
    # decompose to uppercase ASCII ("TM"), which would otherwise escape
    # the lowercasing step.
    text = unicodedata.normalize('NFKD', text)

    # casefold() rather than lower(): it also maps letters like the German
    # sharp s to an ASCII equivalent ("ss") instead of leaving them to be
    # discarded by the ASCII filter below.
    text = text.casefold()

    # Discard whatever is still outside ASCII (combining marks, symbols).
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Demo: two differently punctuated/cased sentences run through
# standardize_text, with each standardized result printed on its own line.
sentence1, sentence2 = (
    "the Quick, BROWN Fox Isnt Jumps OVER the lazy dog!!!",
    "The quick; BROWN big Fox Isn't Jumps OVER the lãzy dog!",
)

std_sentence1, std_sentence2 = map(standardize_text, (sentence1, sentence2))

print(std_sentence1, std_sentence2, sep="\n")



'''
run:

the quick brown fox isnt jumps over the lazy dog
the quick brown big fox isnt jumps over the lazy dog

'''

70+ SQL courses for beginners and professionals

answered Nov 24, 2024 by avibootz

Most popular tags

How to implement basic text standardization in Python

1 Answer

Related questions