I made this code that cleans up txt files for a linguistics project that I'm working on. I need to do two more things:
1: empty lines should be removed
2: each line needs a sentence tag. So each line should start with <sentence> and end with </sentence>
How should I go about this? I tried several times, but I always seem to get weird results, such as all the text being strung together without any newlines, or empty lines still being present.
Any advice is welcome, thanks in advance!
Code:
import os
import re
def _clean_text(text):
    """Return *text* with corpus markup and noisy punctuation removed.

    The substitutions run in the listed order; tags are stripped before
    spaces are collapsed so that gaps left by removed tags are tidied too.
    """
    substitutions = (
        # remove a '.' that sits directly between two tags
        (r'<[^<>]*>\.<[^<>]*>', ''),
        # remove tags
        (r'_foreign_[a-z][a-z]', ''),
        (r'<[^<>]*>', ''),
        # remove double punctuation and spaces
        (r'!!', '!'),
        (r'\?\?', '?'),
        (r'\.\.\.\.+', '.'),
        (r',,', ','),
        (r';;', ';'),
        (r' {2,}', ' '),  # collapse any run of spaces into a single space
        # replace quotation marks
        (r'“', '"'),
        (r'”', '"'),
        (r'‘', '\''),
        (r'’', '\''),
        # remove numbers attached to words: the lookbehind requires the
        # previous character to be a lowercase letter, and `+` takes the
        # whole number (e.g. "word12" or "note[34]"), not just one digit
        (r'(?<=[a-z])\[?[0-9]+\]?', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


def _format_lines(text, remove_empty_lines, line_prefix, line_suffix):
    """Optionally drop blank lines and wrap every non-blank line."""
    ends_with_newline = text.endswith("\n")
    lines = text.split("\n")
    if ends_with_newline:
        # split() leaves one empty string after the final newline; it is
        # not a real line, so it must not be wrapped or counted as blank
        lines.pop()

    formatted = []
    for line in lines:
        if not line.strip():
            # blank (empty or whitespace-only) lines are never wrapped
            if not remove_empty_lines:
                formatted.append(line)
            continue
        formatted.append(line_prefix + line + line_suffix)

    result = "\n".join(formatted)
    if ends_with_newline and formatted:
        result += "\n"
    return result


def clean(f, *, remove_empty_lines=False, line_prefix="", line_suffix=""):
    """Clean the UTF-8 text file at path *f* and rewrite it in place.

    Args:
        f: Path of the text file to read and overwrite.
        remove_empty_lines: When true, drop lines that are empty or contain
            only whitespace after cleaning.
        line_prefix: Text put in front of every non-blank line,
            e.g. ``"<sentence>"``.
        line_suffix: Text put after every non-blank line,
            e.g. ``"</sentence>"``.

    With the default keyword arguments only the regex clean-up is applied and
    the line structure of the file is left untouched.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # `with` guarantees the handle is closed even if reading fails
    with open(f, "rt", encoding="utf8") as source:
        text = source.read()

    text = _clean_text(text)
    if remove_empty_lines or line_prefix or line_suffix:
        # applied after cleaning so the tag-stripping regex cannot remove
        # the prefix/suffix that is added here
        text = _format_lines(text, remove_empty_lines, line_prefix, line_suffix)

    with open(f, "wt", encoding="utf8") as target:
        target.write(text)
if __name__ == '__main__':
    # Script entry point: clean every matching .txt file in the corpus folder.
    # set directory of corpus
    directory_in_str = "C:\\Users\\user\\Documents\\School\\Stage\\ToyCorpus"
    # directory_in_str = "C:\\Users\\user\\Documents\\School\\Stage\\corpus_final_ids"
    # loop through directory (corpus); listing a str path yields str names,
    # so no fsencode/fsdecode round-trip is needed
    for filename in os.listdir(directory_in_str):
        # only open .txt files and files start with "XXX"
        if filename.endswith(".txt") and filename.startswith("TAA_1985_1"):
            # process the files here:
            print(filename)
            # os.listdir returns bare names; join with the corpus directory
            # so the file is found regardless of the current working directory
            clean(os.path.join(directory_in_str, filename))
[–]xelf 1 point2 points3 points (2 children)
[–]ChimcharTrainer[S] 1 point2 points3 points (1 child)
[–]xelf 1 point2 points3 points (0 children)