Skip to content
Snippets Groups Projects
Commit 348a83fc authored by Michele Nottoli's avatar Michele Nottoli
Browse files

Wrote a script for localization.

parent 6465b8f1
Branches
No related tags found
1 merge request: !6 Localization
File moved
File moved
#!/usr/bin/python3
import copy
import json
import os
import sys
import langid
def detect_language(text):
    """Classify the natural language of ``text`` with langid.

    Returns whatever ``langid.classify`` returns for ``text``; the caller in
    this file unpacks it as a ``(language, probability)`` pair.
    """
    classification = langid.classify(text)
    return classification
def load(path):
    """Read the JSON document stored at ``path`` and return the parsed object.

    The file is decoded as UTF-8.
    """
    with open(path, encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)
def human_intervention(language, probability, text):
    """Ask the user on the terminal to label ``text`` by hand.

    The automatic guess (``language`` and ``probability``) is printed together
    with the text, then the user is prompted repeatedly until one of the
    accepted codes is entered.

    Args:
        language: Language code guessed by the classifier (only displayed).
        probability: Score reported by the classifier (only displayed).
        text: The text whose language is in doubt.

    Returns:
        One of ``"en"``, ``"de"`` or ``"eq"`` (the latter marks equations).
    """
    choices = ("en", "de", "eq")
    print(
        f"I classified this text:\n`{text}`\nas {language} with a non normalized probability of {probability}.\n"
    )
    while True:
        print(
            "Can you help me identifying it? write one among 'en', 'de', 'eq' (for equations), then press enter"
        )
        # Normalize the answer so stray spaces or capitals ("EN", " de ")
        # are accepted instead of triggering another prompt.
        answer = input().strip().lower()
        if answer in choices:
            print("\n\n")
            return answer
def write(path, notebook):
    """Serialize ``notebook`` as indented JSON into the file at ``path``.

    The file is created or overwritten and encoded as UTF-8.

    Args:
        path: Destination file path.
        notebook: JSON-serializable object to store.
    """
    with open(path, "w", encoding="utf-8") as file:
        # ensure_ascii=False keeps non-ASCII text (e.g. German umlauts) as
        # real UTF-8 characters instead of \uXXXX escapes; the parsed content
        # is the same either way.
        json.dump(notebook, file, indent=4, ensure_ascii=False)
def assign_languages(notebook):
    """Tag every markdown cell of ``notebook`` with its natural language.

    Each markdown cell's text is classified with ``detect_language``. When the
    guess is not trusted, the user is asked via ``human_intervention``. The
    result is stored in place under ``cell["metadata"]["natural_language"]``.
    Cells of any other type are left untouched.

    Args:
        notebook: Parsed notebook dict with a ``"cells"`` list; modified
            in place.
    """
    for cell in notebook["cells"]:
        if cell["cell_type"] != "markdown":
            continue
        source = cell["source"]
        # The source may be one string or a list of line strings. Join the
        # lines instead of stringifying the list, so that list punctuation
        # and escaped newlines do not leak into the text being classified.
        if not isinstance(source, str):
            source = "".join(source)
        # Drop surrounding blank space and heading markers.
        text = source.strip("\n# ")
        language, probability = detect_language(text)
        # The score is a non-normalized value; anything at or above -10, or
        # any language other than the two supported ones, is not trusted.
        if probability >= -10 or language not in ("en", "de"):
            # langid is doing something strange, we need to intervene
            language = human_intervention(language, probability, text)
        # setdefault tolerates a cell that carries no metadata dict yet.
        cell.setdefault("metadata", {})["natural_language"] = language
def select_language(notebook, language):
    """Return a copy of ``notebook`` restricted to one natural language.

    Markdown cells are kept when their ``natural_language`` metadata equals
    ``language`` or ``"eq"`` (equations belong to every language). Cells of
    any other type are always kept. The input notebook is not modified and
    the result shares no mutable state with it.

    Args:
        notebook: Parsed notebook dict whose markdown cells carry
            ``metadata["natural_language"]``.
        language: Language code to keep, e.g. ``"en"`` or ``"de"``.

    Returns:
        A new notebook dict with the same top-level keys, in the same order,
        and only the selected cells.

    Raises:
        KeyError: If a markdown cell has no ``natural_language`` metadata.
    """
    wanted = (language, "eq")
    selected_cells = [
        cell
        for cell in notebook["cells"]
        if cell["cell_type"] != "markdown"
        or cell["metadata"]["natural_language"] in wanted
    ]
    # Deep-copy only what is kept: the discarded cells are never copied, and
    # the returned cells no longer alias the cells of the input notebook.
    return {
        key: copy.deepcopy(selected_cells if key == "cells" else value)
        for key, value in notebook.items()
    }
if __name__ == "__main__":
    # Usage: <script> NOTEBOOK.ipynb
    # Tags the markdown cells of the given notebook by language, then writes
    # "<basename>_en.ipynb" and "<basename>_de.ipynb" into the current
    # working directory.
    if len(sys.argv) < 2:
        # Exit with a readable message rather than an IndexError traceback.
        sys.exit(f"usage: {sys.argv[0]} NOTEBOOK.ipynb")
    PATH = sys.argv[1]
    BASENAME = os.path.splitext(os.path.basename(PATH))[0]
    NOTEBOOK = load(PATH)
    assign_languages(NOTEBOOK)
    NOTEBOOK_EN = select_language(NOTEBOOK, "en")
    NOTEBOOK_DE = select_language(NOTEBOOK, "de")
    write(f"{BASENAME}_en.ipynb", NOTEBOOK_EN)
    write(f"{BASENAME}_de.ipynb", NOTEBOOK_DE)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment