# Copyright 2023 LiveKit, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Lookup tables for language normalization. Kept separate to avoid bloating the class module."""

# ISO 639-3 to ISO 639-1 mapping (sourced from ElevenLabs plugin + extended)
ISO_639_3_TO_1: dict[str, str | None] = {
    "afr": "af",
    "amh": "am",
    "ara": "ar",
    "hye": "hy",
    "asm": "as",
    "ast": None,
    "aze": "az",
    "bel": "be",
    "ben": "bn",
    "bos": "bs",
    "bul": "bg",
    "mya": "my",
    "yue": None,
    "cat": "ca",
    "ceb": None,
    "cmn": "zh",
    "nya": "ny",
    "hrv": "hr",
    "ces": "cs",
    "dan": "da",
    "nld": "nl",
    "eng": "en",
    "est": "et",
    "fil": None,
    "fin": "fi",
    "fra": "fr",
    "ful": "ff",
    "glg": "gl",
    "lug": "lg",
    "kat": "ka",
    "deu": "de",
    "ell": "el",
    "guj": "gu",
    "hau": "ha",
    "heb": "he",
    "hin": "hi",
    "hun": "hu",
    "isl": "is",
    "ibo": "ig",
    "ind": "id",
    "gle": "ga",
    "ita": "it",
    "jpn": "ja",
    "jav": "jv",
    "kea": None,
    "kan": "kn",
    "kaz": "kk",
    "khm": "km",
    "kor": "ko",
    "kur": "ku",
    "kir": "ky",
    "lao": "lo",
    "lav": "lv",
    "lin": "ln",
    "lit": "lt",
    "luo": None,
    "ltz": "lb",
    "mkd": "mk",
    "msa": "ms",
    "mal": "ml",
    "mlt": "mt",
    "zho": "zh",
    "mri": "mi",
    "mar": "mr",
    "mon": "mn",
    "nep": "ne",
    "nso": None,
    "nor": "no",
    "oci": "oc",
    "ori": "or",
    "pus": "ps",
    "fas": "fa",
    "pol": "pl",
    "por": "pt",
    "pan": "pa",
    "ron": "ro",
    "rus": "ru",
    "srp": "sr",
    "sna": "sn",
    "snd": "sd",
    "slk": "sk",
    "slv": "sl",
    "som": "so",
    "spa": "es",
    "swa": "sw",
    "swe": "sv",
    "tam": "ta",
    "tgk": "tg",
    "tel": "te",
    "tha": "th",
    "tur": "tr",
    "ukr": "uk",
    "umb": None,
    "urd": "ur",
    "uzb": "uz",
    "vie": "vi",
    "cym": "cy",
    "wol": "wo",
    "xho": "xh",
    "zul": "zu",
}

# Lowercase English language names to ISO 639-1 codes.
# Covers NLTK punkt languages + common English names.
#
# "slovene" and "slovenian" are two spellings of the same language and share
# the code "sl"; every other code appears exactly once, which keeps the
# reverse (code -> name) lookup unambiguous.
#
# "assamese", "chichewa", "fulah", "ganda" and "igbo" give a name to the
# two-letter codes that ISO_639_3_TO_1 can produce ("as", "ny", "ff", "lg",
# "ig"); the spellings follow the ISO 639 English names.
LANGUAGE_NAMES_TO_CODE: dict[str, str] = {
    "afrikaans": "af",
    "albanian": "sq",
    "amharic": "am",
    "arabic": "ar",
    "armenian": "hy",
    "assamese": "as",
    "azerbaijani": "az",
    "basque": "eu",
    "belarusian": "be",
    "bengali": "bn",
    "bosnian": "bs",
    "bulgarian": "bg",
    "burmese": "my",
    "catalan": "ca",
    "chichewa": "ny",
    "chinese": "zh",
    "croatian": "hr",
    "czech": "cs",
    "danish": "da",
    "dutch": "nl",
    "english": "en",
    "estonian": "et",
    "finnish": "fi",
    "french": "fr",
    "fulah": "ff",
    "galician": "gl",
    "ganda": "lg",
    "georgian": "ka",
    "german": "de",
    "greek": "el",
    "gujarati": "gu",
    "hausa": "ha",
    "hebrew": "he",
    "hindi": "hi",
    "hungarian": "hu",
    "icelandic": "is",
    "igbo": "ig",
    "indonesian": "id",
    "irish": "ga",
    "italian": "it",
    "japanese": "ja",
    "javanese": "jv",
    "kannada": "kn",
    "kazakh": "kk",
    "khmer": "km",
    "korean": "ko",
    "kurdish": "ku",
    "kyrgyz": "ky",
    "lao": "lo",
    "latvian": "lv",
    "lingala": "ln",
    "lithuanian": "lt",
    "luxembourgish": "lb",
    "macedonian": "mk",
    "malay": "ms",
    "malayalam": "ml",
    "maltese": "mt",
    "maori": "mi",
    "marathi": "mr",
    "mongolian": "mn",
    "nepali": "ne",
    "norwegian": "no",
    "occitan": "oc",
    "oriya": "or",
    "pashto": "ps",
    "persian": "fa",
    "polish": "pl",
    "portuguese": "pt",
    "punjabi": "pa",
    "romanian": "ro",
    "russian": "ru",
    "serbian": "sr",
    "shona": "sn",
    "sindhi": "sd",
    "slovak": "sk",
    "slovene": "sl",
    "slovenian": "sl",
    "somali": "so",
    "spanish": "es",
    "swahili": "sw",
    "swedish": "sv",
    "tagalog": "tl",
    "tamil": "ta",
    "tajik": "tg",
    "telugu": "te",
    "thai": "th",
    "turkish": "tr",
    "ukrainian": "uk",
    "urdu": "ur",
    "uzbek": "uz",
    "vietnamese": "vi",
    "welsh": "cy",
    "wolof": "wo",
    "xhosa": "xh",
    "yoruba": "yo",
    "zulu": "zu",
}

# Inverse lookup: ISO 639-1 code -> language name (for NLTK compatibility).
CODE_TO_LANGUAGE_NAME: dict[str, str] = {
    code: name for name, code in LANGUAGE_NAMES_TO_CODE.items()
}
# Both "slovene" and "slovenian" map to "sl", so the inversion above keeps
# whichever name comes last; pin the entry to the preferred, more common name.
CODE_TO_LANGUAGE_NAME.update(sl="slovene")