Spaces:

polyglot-tagger
/

language-extractor-demo

Sleeping

App Files Community

DerivedFunction1 committed on Apr 18

Commit

4feb4fd

1 Parent(s): f241861

update

Browse files

Files changed (4) hide show

app.py +1 -1
language.py +0 -8
language_aliases.json +3 -0
source_config.py +0 -251

app.py CHANGED Viewed

@@ -24,7 +24,7 @@ from sib200_cache import fetch_random_sib200_sentence, fetch_random_sib200_sente
 from tatoeba import fetch_random_tatoeba_sentence, fetch_random_tatoeba_sentence_mix
-MODEL_CHECKPOINT = "DerivedFunction/polyglot-tagger-v2.1"
 FASTTEXT_MODEL_REPO = "facebook/fasttext-language-identification"
 FASTTEXT_MODEL_FILENAME = "model.bin"
 FASTTEXT_MIN_CONFIDENCE = 0.15

 from tatoeba import fetch_random_tatoeba_sentence, fetch_random_tatoeba_sentence_mix
+MODEL_CHECKPOINT = "DerivedFunction/polyglot-tagger-v2.2"
 FASTTEXT_MODEL_REPO = "facebook/fasttext-language-identification"
 FASTTEXT_MODEL_FILENAME = "model.bin"
 FASTTEXT_MIN_CONFIDENCE = 0.15

language.py CHANGED Viewed

@@ -1,17 +1,10 @@
 from __future__ import annotations
 import json
-import os
 from pathlib import Path
 import pycountry
-from source_config import LANGUAGE_BUCKETS
-LANGUAGE_GROUPS = {group: cfg["langs"] for group, cfg in LANGUAGE_BUCKETS.items()}
-LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
-LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
-LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}
 LANGUAGE_ALIASES_JSON = Path(__file__).with_name("language_aliases.json")
@@ -48,7 +41,6 @@ LANG_ALIASES = {
     for alias in aliases
 }
 LANG_SOURCE_CODES = {canonical: tuple(aliases) for canonical, aliases in LANGUAGE_ALIASES.items()}
-LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
 def canonical_lang(lang: str) -> str:

 from __future__ import annotations
 import json
 from pathlib import Path
 import pycountry
 LANGUAGE_ALIASES_JSON = Path(__file__).with_name("language_aliases.json")
     for alias in aliases
 }
 LANG_SOURCE_CODES = {canonical: tuple(aliases) for canonical, aliases in LANGUAGE_ALIASES.items()}
 def canonical_lang(lang: str) -> str:

language_aliases.json CHANGED Viewed

@@ -52,6 +52,7 @@
   "ug": ["uig"],
   "el": ["ell", "gre", "grc"],
   "he": ["heb", "iw", "hbo"],
   "hy": ["hye", "hyw"],
   "ka": ["kat"],
   "am": ["amh"],
@@ -66,6 +67,7 @@
   "sw": ["swa", "swh"],
   "eu": ["eus"],
   "tl": ["fil", "tgl"],
   "ca": ["cat"],
   "gl": ["glg"],
   "oc": ["oci"],
@@ -73,6 +75,7 @@
   "ga": ["gle"],
   "gd": ["gla"],
   "cy": ["cym"],
   "bs": ["bos"],
   "hr": ["hrv"],
   "sl": ["slv"],

   "ug": ["uig"],
   "el": ["ell", "gre", "grc"],
   "he": ["heb", "iw", "hbo"],
+  "yi": ["yid", "ydd", "yds"],
   "hy": ["hye", "hyw"],
   "ka": ["kat"],
   "am": ["amh"],
   "sw": ["swa", "swh"],
   "eu": ["eus"],
   "tl": ["fil", "tgl"],
+  "xh": ["xho"],
   "ca": ["cat"],
   "gl": ["glg"],
   "oc": ["oci"],
   "ga": ["gle"],
   "gd": ["gla"],
   "cy": ["cym"],
+  "sco": [],
   "bs": ["bos"],
   "hr": ["hrv"],
   "sl": ["slv"],

source_config.py DELETED Viewed

@@ -1,251 +0,0 @@
-from __future__ import annotations
-LANGUAGE_BUCKETS = {
-    # ~41% of CC — intentionally capped to avoid crowding out other languages
-    "English": {
-        "langs": ["en"],
-        "weight": 2.9,
-        "min_chars": 2_000,
-        "latin": True,
-    },
-    # ~6.3% of CC — was badly underweighted relative to German/French
-    "Russian": {
-        "langs": ["ru"],
-        "weight": 1.95,
-        "min_chars": 2_000,
-        "latin": False,
-    },
-    # ~5.9% of CC
-    "German": {
-        "langs": ["de"],
-        "weight": 1.9,
-        "min_chars": 2_000,
-        "latin": True,
-    },
-    # ~5.7% of CC — bumped up from 1.7 to match its actual footprint
-    "Japanese": {
-        "langs": ["ja"],
-        "weight": 1.9,
-        "min_chars": 1_200,
-        "latin": False,
-    },
-    # ~5.0% of CC — CC likely undercounts due to Great Firewall
-    "Chinese": {
-        "langs": ["zh"],
-        "weight": 1.9,
-        "min_chars": 1_200,
-        "latin": False,
-    },
-    # ~4.6% of CC
-    "French": {
-        "langs": ["fr"],
-        "weight": 1.9,
-        "min_chars": 2_000,
-        "latin": True,
-    },
-    # ~4.6% of CC
-    "Spanish": {
-        "langs": ["es"],
-        "weight": 1.9,
-        "min_chars": 2_000,
-        "latin": True,
-    },
-    # ~2.5% of CC
-    "Portuguese": {
-        "langs": ["pt"],
-        "weight": 1.7,
-        "min_chars": 2_000,
-        "latin": True,
-    },
-    # ~2.4% of CC
-    "Italian": {
-        "langs": ["it"],
-        "weight": 1.6,
-        "min_chars": 2_000,
-        "latin": True,
-    },
-    # ~2.0% of CC — split out from CentralEuropeanLatin; rivals Italian/Portuguese
-    "Polish": {
-        "langs": ["pl"],
-        "weight": 1.55,
-        "min_chars": 2_000,
-        "latin": True,
-    },
-    # ~1.8% of CC — was significantly underweighted at 1.15
-    "Dutch": {
-        "langs": ["nl"],
-        "weight": 1.55,
-        "min_chars": 2_000,
-        "latin": True,
-    },
-    # ~1.2% of CC — split out from CentralEuropeanLatin; large internet population
-    "Turkish": {
-        "langs": ["tr"],
-        "weight": 1.45,
-        "min_chars": 2_000,
-        "latin": True,
-    },
-    # ind ~1.1%, vie ~1.05% of CC
-    "SoutheastAsianLatin": {
-        "langs": ["vi", "id", "ms", "sq", "la"],
-        "weight": 1.55,
-        "min_chars": 2_000,
-        "latin": True,
-    },
-    "WesternLatin": {
-        "langs": ["ca", "gl", "oc"],
-        "weight": 1.2,
-        "min_chars": 1_500,
-        "latin": True,
-    },
-    "CelticLatin": {
-        "langs": ["br", "ga", "gd", "cy"],
-        "weight": 1.3,
-        "min_chars": 1_500,
-        "latin": True,
-    },
-    "AdriaticLatin": {
-        "langs": ["bs", "hr", "sl", "sk"],
-        "weight": 1.4,
-        "min_chars": 1_500,
-        "latin": True,
-    },
-    "BalticLatin": {
-        "langs": ["et", "lv", "lt"],
-        "weight": 1.2,
-        "min_chars": 1_500,
-        "latin": True,
-    },
-    # ces ~1.14%, ron ~0.53%, hun ~0.52% of CC — smaller tier after splitting out pl/tr
-    "CentralEuropeanLatin": {
-        "langs": ["cs", "ro", "hu"],
-        "weight": 1.3,
-        "min_chars": 2_000,
-        "latin": True,
-    },
-    # ~0.81% of CC — was overweighted at 1.7
-    "Korean": {
-        "langs": ["ko"],
-        "weight": 1.35,
-        "min_chars": 1_200,
-        "latin": False,
-    },
-    # ukr ~0.70%, bel ~0.017% of CC
-    "EastSlavicCyrillic": {
-        "langs": ["uk", "be"],
-        "weight": 1.7,
-        "min_chars": 2_000,
-        "latin": False,
-    },
-    # ~0.65% of CC — upweighted relative to CC share given speaker population
-    "Arabic": {
-        "langs": ["ar"],
-        "weight": 1.4,
-        "min_chars": 2_000,
-        "latin": False,
-    },
-    "Norwegian": {
-        "langs": ["no"],
-        "weight": 1.0,
-        "min_chars": 2_000,
-        "latin": True,
-    },
-    # sv ~0.7%, dan ~0.51%, fin ~0.37%, isl ~0.04%, afr ~0.01%
-    # combined ~2.0% of CC — was drastically overweighted at 6.0
-    # note: Swedish Wikipedia is heavily bot-generated stubs, don't rely on article count
-    "NordicCore": {
-        "langs": ["sv", "da", "is", "af", "fi"],
-        "weight": 2.1,
-        "min_chars": 2_000,
-        "latin": True,
-    },
-    # bul ~0.27%, srp ~0.25%, mkd ~0.037% of CC
-    "BalkanCyrillic": {
-        "langs": ["bg", "sr", "mk"],
-        "weight": 1.05,
-        "min_chars": 2_000,
-        "latin": False,
-    },
-    # fas ~0.20% of CC (ignore the one anomalous crawl spike)
-    "ArabicOther": {
-        "langs": ["fa", "ps", "sd", "ug"],
-        "weight": 0.95,
-        "min_chars": 2_000,
-        "latin": False,
-    },
-    # ~0.22% of CC — genuine web underrepresentation relative to speaker count,
-    # but corpus is thin; 1.0 avoids oversampling a small pool
-    "Hindi": {
-        "langs": ["hi"],
-        "weight": 1.0,
-        "min_chars": 2_000,
-        "latin": False,
-    },
-    # combined ~0.27% of CC — upweighted for script diversity
-    "IndicOther": {
-        "langs": [
-            "ur",
-            "bn",
-            "ta",
-            "te",
-            "mr",
-            "gu",
-            "kn",
-            "ml",
-            "pa",
-            "as",
-            "or",
-            "ne",
-        ],
-        "weight": 0.95,
-        "min_chars": 2_000,
-        "latin": False,
-    },
-    # kk ~0.038%, mn ~0.016% of CC — very thin corpus, weight is already a large relative boost
-    "CentralAsianCaucusCyrillic": {
-        "langs": ["kk", "mn", "tt", "ky", "tg", "ba", "ce"],
-        "weight": 1.1,
-        "min_chars": 2_000,
-        "latin": False,
-    },
-    # Kurdish is split by script/source:
-    # - ku: Wikipedia / Latin-script Kurdish
-    # - ckb: FineTranslations / Arabic-script Kurdish
-    "KurdishLatin": {
-        "langs": ["ku"],
-        "weight": 0.45,
-        "min_chars": 1_500,
-        "latin": True,
-    },
-    "KurdishArabic": {
-        "langs": ["ckb"],
-        "weight": 0.45,
-        "min_chars": 2_000,
-        "latin": False,
-    },
-    "AfricanLatin": {
-        "langs": ["sw", "tl", "eu", "yo", "zu", "ny"],
-        "weight": 1.0,
-        "min_chars": 1_500,
-        "latin": True,
-    },
-    "PeripheralLatin": {
-        "langs": ["eo", "jv", "lb", "mg", "mt", "om", "rm", "so", "su", "uz"],
-        "weight": 1.0,
-        "min_chars": 1_500,
-        "latin": True,
-    },
-    # Split the remaining non-Latin scripts into two buckets to keep
-    # Greco-Semitic/Caucasus-style scripts separate from Brahmic/Tibetan ones.
-    "OtherScriptsWest": {
-        "langs": ["el", "he", "hy", "ka", "am", "ti", "dv", "hbo", "grc"],
-        "weight": 1.0,
-        "min_chars": 2_000,
-        "latin": False,
-    },
-    "OtherScriptsEast": {
-        "langs": ["km", "lo", "my", "th", "si", "bo"],
-        "weight": 1.0,
-        "min_chars": 2_000,
-        "latin": False,
-    },
-}