Initial commit

This commit is contained in:
2022-03-31 19:44:24 +02:00
commit 5998d74a37
16 changed files with 1597 additions and 0 deletions

View File

@@ -0,0 +1,20 @@
# Print the "flexion" column of every entry in grammalecte.txt that carries
# at least one of the tags listed in KEPT_TAGS, one value per output line.

# Number of leading lines in grammalecte.txt that are skipped before the
# tab-separated data rows start.
HEADER_LINES = 16

# Names of the tab-separated columns of a data row, in file order.
COLUMNS = (
    "id", "fid", "flexion", "lemme", "etiquettes", "metagraphe", "metaphone",
    "notes", "semantique", "etymologie", "sous_dictionnaire", "google_1_grams",
    "wikipedia", "wikisource", "litterature", "total", "doublons", "multiples",
    "frequence", "indice",
)
FLEXION_COLUMN = COLUMNS.index("flexion")
ETIQUETTES_COLUMN = COLUMNS.index("etiquettes")

# Entries are kept when their whitespace-separated tag list contains any of
# these (presumably noun, adjective, adverb and infinitive -- confirm against
# the lexicon's documentation).
KEPT_TAGS = frozenset({"nom", "adj", "adv", "infi"})


def extract_flexions(lines):
    """Yield the ``flexion`` field of each data row carrying a kept tag.

    Args:
        lines: Iterable of data rows (header already removed), each a string
            of tab-separated fields without the trailing newline.

    Yields:
        The ``flexion`` field of every row whose ``etiquettes`` field contains
        at least one tag from ``KEPT_TAGS``, in input order.

    Raises:
        ValueError: If a non-blank row does not have exactly
            ``len(COLUMNS)`` fields.
    """
    for line in lines:
        if not line.strip():
            # A stray blank line carries no entry; skip it instead of failing.
            continue
        fields = line.split("\t")
        if len(fields) != len(COLUMNS):
            raise ValueError(
                f"expected {len(COLUMNS)} tab-separated fields, "
                f"got {len(fields)}: {line!r}"
            )
        # Tags are matched as whole tokens, not as substrings.
        if KEPT_TAGS.intersection(fields[ETIQUETTES_COLUMN].split()):
            yield fields[FLEXION_COLUMN]


def main():
    """Read grammalecte.txt and print the selected flexions to stdout."""
    # Explicit encoding: the default depends on the platform locale and can
    # corrupt or reject accented characters.
    with open("grammalecte.txt", "r", encoding="utf-8") as f:
        lines = f.read().splitlines()
    for flexion in extract_flexions(lines[HEADER_LINES:]):
        print(flexion)


if __name__ == "__main__":
    main()

23
scripts/wordlize.py Normal file
View File

@@ -0,0 +1,23 @@
import sys
import re
from unidecode import unidecode
# Matches a candidate made only of the letters A-Z. At least one letter is
# required so that blank input lines do not produce an empty "word".
_WORD_RE = re.compile(r"[A-Z]+")


def collect_words(lines, transliterate):
    """Return the sorted, de-duplicated upper-case words found in *lines*.

    Each line is passed through *transliterate*, stripped of surrounding
    whitespace and upper-cased. The result is kept only when it consists
    solely of the letters A-Z (this drops words with dashes, apostrophes,
    digits, ... as well as empty lines).

    Args:
        lines: Iterable of strings, one candidate word per item. Trailing
            newlines are tolerated.
        transliterate: Callable mapping a string to the string that should be
            normalised (the script passes ``unidecode``).

    Returns:
        A sorted list of unique normalised words.
    """
    words = set()
    for line in lines:
        candidate = transliterate(line).strip().upper()
        if _WORD_RE.fullmatch(candidate):
            words.add(candidate)
    return sorted(words)


def main():
    """Normalise the words of the file named on the command line.

    The single argument is a file path, or ``-`` to read standard input.
    The resulting words are printed one per line.
    """
    if len(sys.argv) < 2:
        # Exit with a readable message instead of a bare IndexError.
        sys.exit(f"usage: {sys.argv[0]} FILE  (use '-' to read standard input)")
    file = sys.argv[1]
    if file == "-":
        lines = sys.stdin
    else:
        # Explicit encoding: the input is expected to contain accented
        # characters, and the default encoding is locale-dependent.
        with open(file, "r", encoding="utf-8") as f:
            lines = f.read().splitlines()
    for word in collect_words(lines, unidecode):
        print(word)


if __name__ == "__main__":
    main()