diff --git a/0_Installation.ipynb b/src/0_Installation.ipynb
similarity index 100%
rename from 0_Installation.ipynb
rename to src/0_Installation.ipynb
diff --git a/1_Basics.ipynb b/src/1_Basics.ipynb
similarity index 100%
rename from 1_Basics.ipynb
rename to src/1_Basics.ipynb
diff --git a/src/localize.py b/src/localize.py
new file mode 100755
index 0000000000000000000000000000000000000000..05b2d8d39241db941e0c2eeab58b9dcc971093f2
--- /dev/null
+++ b/src/localize.py
@@ -0,0 +1,137 @@
+#!/usr/bin/python3
+"""Split a bilingual English/German notebook into one notebook per language.
+
+Every markdown cell is tagged with its natural language (detected with
+``langid``, asking the user when the detection looks unreliable). One
+``<basename>_en.ipynb`` and one ``<basename>_de.ipynb`` are then written to
+the current working directory.
+"""
+
+import copy
+import json
+import os
+import sys
+
+import langid
+
+# Languages a markdown cell may be written in.
+NATURAL_LANGUAGES = ("en", "de")
+# Tag for cells (e.g. equations) that are kept in every localized notebook.
+LANGUAGE_NEUTRAL = "eq"
+# Detections scoring at or above this value are double-checked with the user.
+# NOTE(review): langid scores are unnormalized; confirm this is the intended cut-off.
+SCORE_THRESHOLD = -10
+
+
+def detect_language(text):
+    """Return the ``(language, score)`` pair that langid assigns to *text*."""
+    return langid.classify(text)
+
+
+def load(path):
+    """Read the notebook stored at *path* and return the parsed JSON."""
+    with open(path, encoding="utf-8") as file:
+        return json.load(file)
+
+
+def human_intervention(language, probability, text):
+    """Ask the user on stdin which language *text* is written in.
+
+    The automatic guess (*language* with its *probability* score) is shown
+    first; the question is repeated until the answer is 'en', 'de' or 'eq',
+    and that answer is returned.
+    """
+    print(
+        f"I classified this text:\n`{text}`\nas {language} with a non normalized probability of {probability}.\n"
+    )
+    choices = (*NATURAL_LANGUAGES, LANGUAGE_NEUTRAL)
+    while True:
+        print(
+            "Can you help me identifying it? write one among 'en', 'de', 'eq' (for equations), then press enter"
+        )
+        answer = input()
+        if answer in choices:
+            print("\n\n")
+            return answer
+
+
+def write(path, notebook):
+    """Serialize *notebook* as JSON to *path*."""
+    with open(path, "w", encoding="utf-8") as file:
+        # The file is UTF-8, so keep non-ASCII text (e.g. German umlauts)
+        # readable instead of escaping it.
+        json.dump(notebook, file, indent=4, ensure_ascii=False)
+        file.write("\n")
+
+
+def _markdown_text(cell):
+    """Return the source of *cell* as one string without heading markers."""
+    source = cell["source"]
+    # A cell source is either a single string or a list of line strings.
+    if not isinstance(source, str):
+        source = "".join(source)
+    return source.strip("\n# ")
+
+
+def assign_languages(notebook):
+    """Tag every markdown cell of *notebook* with its natural language.
+
+    The tag ('en', 'de' or 'eq') is stored in place under
+    ``cell["metadata"]["natural_language"]``. The user is asked whenever
+    langid reports a language other than English or German, or a score at
+    or above ``SCORE_THRESHOLD``.
+    """
+    for cell in notebook["cells"]:
+        if cell["cell_type"] != "markdown":
+            continue
+
+        text = _markdown_text(cell)
+        language, probability = detect_language(text)
+
+        if probability >= SCORE_THRESHOLD or language not in NATURAL_LANGUAGES:
+            # langid is doing something strange, we need to intervene
+            language = human_intervention(language, probability, text)
+
+        cell.setdefault("metadata", {})["natural_language"] = language
+
+
+def select_language(notebook, language):
+    """Return a copy of *notebook* restricted to one natural language.
+
+    Markdown cells are kept when they are tagged with *language* or with
+    'eq'; cells of any other type are always kept. Requires the tags set
+    by ``assign_languages``. The input notebook is left untouched and
+    shares no cell with the result.
+    """
+    wanted = (language, LANGUAGE_NEUTRAL)
+    selected_cells = [
+        cell
+        for cell in notebook["cells"]
+        if cell["cell_type"] != "markdown"
+        or cell["metadata"]["natural_language"] in wanted
+    ]
+
+    # Copy only what ends up in the result, keeping the top-level key order.
+    return {
+        key: copy.deepcopy(selected_cells if key == "cells" else value)
+        for key, value in notebook.items()
+    }
+
+
+def main(argv):
+    """Localize the notebook whose path is the only command-line argument."""
+    if len(argv) != 2:
+        sys.exit("usage: localize.py NOTEBOOK.ipynb")
+
+    path = argv[1]
+    basename = os.path.splitext(os.path.basename(path))[0]
+
+    notebook = load(path)
+    assign_languages(notebook)
+
+    for language in NATURAL_LANGUAGES:
+        write(f"{basename}_{language}.ipynb", select_language(notebook, language))
+
+
+if __name__ == "__main__":
+    main(sys.argv)