Files
cambot/scripts/wordlize.py
2022-03-31 19:44:24 +02:00

23 lines
468 B
Python

import sys
import re
from unidecode import unidecode
# A valid word consists of one or more ASCII capital letters and nothing else.
# Requiring at least one letter keeps blank input lines from producing an
# empty "word" in the output.
WORD_PATTERN = re.compile(r"[A-Z]+")


def wordlize(lines):
    """Return the sorted, de-duplicated wordlist built from *lines*.

    Each line is transliterated with ``unidecode``, stripped of surrounding
    whitespace and upper-cased. Results that are empty or contain anything
    other than the letters A-Z (dashes, apostrophes, digits, inner spaces...)
    are dropped.

    Args:
        lines: Iterable of strings, one candidate word per item. Trailing
            newlines are tolerated because every item is stripped.

    Returns:
        A list of unique upper-case words in ascending order.
    """
    words = set()
    for line in lines:
        word = unidecode(line).strip().upper()
        # fullmatch anchors both ends, so no explicit ^...$ is needed.
        if WORD_PATTERN.fullmatch(word):
            words.add(word)
    return sorted(words)


def main(argv):
    """Read words from the file named in *argv* and print the wordlist.

    Args:
        argv: Command-line arguments in ``sys.argv`` layout; ``argv[1]`` is
            the input path, or ``-`` to read from standard input.

    Raises:
        SystemExit: If no input path was given.
    """
    if len(argv) < 2:
        sys.exit("usage: wordlize.py FILE|-")

    path = argv[1]
    if path == "-":
        words = wordlize(sys.stdin)
    else:
        with open(path, "r") as f:
            words = wordlize(f.read().splitlines())

    for word in words:
        print(word)


if __name__ == "__main__":
    main(sys.argv)