DerivedFunction1 committed on
Commit
4feb4fd
·
1 Parent(s): f241861
Files changed (4) hide show
  1. app.py +1 -1
  2. language.py +0 -8
  3. language_aliases.json +3 -0
  4. source_config.py +0 -251
app.py CHANGED
@@ -24,7 +24,7 @@ from sib200_cache import fetch_random_sib200_sentence, fetch_random_sib200_sente
24
  from tatoeba import fetch_random_tatoeba_sentence, fetch_random_tatoeba_sentence_mix
25
 
26
 
27
- MODEL_CHECKPOINT = "DerivedFunction/polyglot-tagger-v2.1"
28
  FASTTEXT_MODEL_REPO = "facebook/fasttext-language-identification"
29
  FASTTEXT_MODEL_FILENAME = "model.bin"
30
  FASTTEXT_MIN_CONFIDENCE = 0.15
 
24
  from tatoeba import fetch_random_tatoeba_sentence, fetch_random_tatoeba_sentence_mix
25
 
26
 
27
+ MODEL_CHECKPOINT = "DerivedFunction/polyglot-tagger-v2.2"
28
  FASTTEXT_MODEL_REPO = "facebook/fasttext-language-identification"
29
  FASTTEXT_MODEL_FILENAME = "model.bin"
30
  FASTTEXT_MIN_CONFIDENCE = 0.15
language.py CHANGED
@@ -1,17 +1,10 @@
1
  from __future__ import annotations
2
 
3
  import json
4
- import os
5
  from pathlib import Path
6
 
7
  import pycountry
8
 
9
- from source_config import LANGUAGE_BUCKETS
10
-
11
- LANGUAGE_GROUPS = {group: cfg["langs"] for group, cfg in LANGUAGE_BUCKETS.items()}
12
- LANGUAGE_GROUP_WEIGHTS = {group: float(cfg["weight"]) for group, cfg in LANGUAGE_BUCKETS.items()}
13
- LANGUAGE_GROUP_MIN_CHARS = {group: int(cfg["min_chars"]) for group, cfg in LANGUAGE_BUCKETS.items()}
14
- LATIN_GROUPS = {group for group, cfg in LANGUAGE_BUCKETS.items() if cfg.get("latin")}
15
  LANGUAGE_ALIASES_JSON = Path(__file__).with_name("language_aliases.json")
16
 
17
 
@@ -48,7 +41,6 @@ LANG_ALIASES = {
48
  for alias in aliases
49
  }
50
  LANG_SOURCE_CODES = {canonical: tuple(aliases) for canonical, aliases in LANGUAGE_ALIASES.items()}
51
- LANG_TO_GROUP = {lang: group for group, langs in LANGUAGE_GROUPS.items() for lang in langs}
52
 
53
 
54
  def canonical_lang(lang: str) -> str:
 
1
  from __future__ import annotations
2
 
3
  import json
 
4
  from pathlib import Path
5
 
6
  import pycountry
7
 
 
 
 
 
 
 
8
  LANGUAGE_ALIASES_JSON = Path(__file__).with_name("language_aliases.json")
9
 
10
 
 
41
  for alias in aliases
42
  }
43
  LANG_SOURCE_CODES = {canonical: tuple(aliases) for canonical, aliases in LANGUAGE_ALIASES.items()}
 
44
 
45
 
46
  def canonical_lang(lang: str) -> str:
language_aliases.json CHANGED
@@ -52,6 +52,7 @@
52
  "ug": ["uig"],
53
  "el": ["ell", "gre", "grc"],
54
  "he": ["heb", "iw", "hbo"],
 
55
  "hy": ["hye", "hyw"],
56
  "ka": ["kat"],
57
  "am": ["amh"],
@@ -66,6 +67,7 @@
66
  "sw": ["swa", "swh"],
67
  "eu": ["eus"],
68
  "tl": ["fil", "tgl"],
 
69
  "ca": ["cat"],
70
  "gl": ["glg"],
71
  "oc": ["oci"],
@@ -73,6 +75,7 @@
73
  "ga": ["gle"],
74
  "gd": ["gla"],
75
  "cy": ["cym"],
 
76
  "bs": ["bos"],
77
  "hr": ["hrv"],
78
  "sl": ["slv"],
 
52
  "ug": ["uig"],
53
  "el": ["ell", "gre", "grc"],
54
  "he": ["heb", "iw", "hbo"],
55
+ "yi": ["yid", "ydd", "yds"],
56
  "hy": ["hye", "hyw"],
57
  "ka": ["kat"],
58
  "am": ["amh"],
 
67
  "sw": ["swa", "swh"],
68
  "eu": ["eus"],
69
  "tl": ["fil", "tgl"],
70
+ "xh": ["xho"],
71
  "ca": ["cat"],
72
  "gl": ["glg"],
73
  "oc": ["oci"],
 
75
  "ga": ["gle"],
76
  "gd": ["gla"],
77
  "cy": ["cym"],
78
+ "sco": [],
79
  "bs": ["bos"],
80
  "hr": ["hrv"],
81
  "sl": ["slv"],
source_config.py DELETED
@@ -1,251 +0,0 @@
1
- from __future__ import annotations
2
- LANGUAGE_BUCKETS = {
3
- # ~41% of CC — intentionally capped to avoid crowding out other languages
4
- "English": {
5
- "langs": ["en"],
6
- "weight": 2.9,
7
- "min_chars": 2_000,
8
- "latin": True,
9
- },
10
- # ~6.3% of CC — was badly underweighted relative to German/French
11
- "Russian": {
12
- "langs": ["ru"],
13
- "weight": 1.95,
14
- "min_chars": 2_000,
15
- "latin": False,
16
- },
17
- # ~5.9% of CC
18
- "German": {
19
- "langs": ["de"],
20
- "weight": 1.9,
21
- "min_chars": 2_000,
22
- "latin": True,
23
- },
24
- # ~5.7% of CC — bumped up from 1.7 to match its actual footprint
25
- "Japanese": {
26
- "langs": ["ja"],
27
- "weight": 1.9,
28
- "min_chars": 1_200,
29
- "latin": False,
30
- },
31
- # ~5.0% of CC — CC likely undercounts due to Great Firewall
32
- "Chinese": {
33
- "langs": ["zh"],
34
- "weight": 1.9,
35
- "min_chars": 1_200,
36
- "latin": False,
37
- },
38
- # ~4.6% of CC
39
- "French": {
40
- "langs": ["fr"],
41
- "weight": 1.9,
42
- "min_chars": 2_000,
43
- "latin": True,
44
- },
45
- # ~4.6% of CC
46
- "Spanish": {
47
- "langs": ["es"],
48
- "weight": 1.9,
49
- "min_chars": 2_000,
50
- "latin": True,
51
- },
52
- # ~2.5% of CC
53
- "Portuguese": {
54
- "langs": ["pt"],
55
- "weight": 1.7,
56
- "min_chars": 2_000,
57
- "latin": True,
58
- },
59
- # ~2.4% of CC
60
- "Italian": {
61
- "langs": ["it"],
62
- "weight": 1.6,
63
- "min_chars": 2_000,
64
- "latin": True,
65
- },
66
- # ~2.0% of CC — split out from CentralEuropeanLatin; rivals Italian/Portuguese
67
- "Polish": {
68
- "langs": ["pl"],
69
- "weight": 1.55,
70
- "min_chars": 2_000,
71
- "latin": True,
72
- },
73
- # ~1.8% of CC — was significantly underweighted at 1.15
74
- "Dutch": {
75
- "langs": ["nl"],
76
- "weight": 1.55,
77
- "min_chars": 2_000,
78
- "latin": True,
79
- },
80
- # ~1.2% of CC — split out from CentralEuropeanLatin; large internet population
81
- "Turkish": {
82
- "langs": ["tr"],
83
- "weight": 1.45,
84
- "min_chars": 2_000,
85
- "latin": True,
86
- },
87
- # ind ~1.1%, vie ~1.05% of CC
88
- "SoutheastAsianLatin": {
89
- "langs": ["vi", "id", "ms", "sq", "la"],
90
- "weight": 1.55,
91
- "min_chars": 2_000,
92
- "latin": True,
93
- },
94
- "WesternLatin": {
95
- "langs": ["ca", "gl", "oc"],
96
- "weight": 1.2,
97
- "min_chars": 1_500,
98
- "latin": True,
99
- },
100
- "CelticLatin": {
101
- "langs": ["br", "ga", "gd", "cy"],
102
- "weight": 1.3,
103
- "min_chars": 1_500,
104
- "latin": True,
105
- },
106
- "AdriaticLatin": {
107
- "langs": ["bs", "hr", "sl", "sk"],
108
- "weight": 1.4,
109
- "min_chars": 1_500,
110
- "latin": True,
111
- },
112
- "BalticLatin": {
113
- "langs": ["et", "lv", "lt"],
114
- "weight": 1.2,
115
- "min_chars": 1_500,
116
- "latin": True,
117
- },
118
- # ces ~1.14%, ron ~0.53%, hun ~0.52% of CC — smaller tier after splitting out pl/tr
119
- "CentralEuropeanLatin": {
120
- "langs": ["cs", "ro", "hu"],
121
- "weight": 1.3,
122
- "min_chars": 2_000,
123
- "latin": True,
124
- },
125
- # ~0.81% of CC — was overweighted at 1.7
126
- "Korean": {
127
- "langs": ["ko"],
128
- "weight": 1.35,
129
- "min_chars": 1_200,
130
- "latin": False,
131
- },
132
- # ukr ~0.70%, bel ~0.017% of CC
133
- "EastSlavicCyrillic": {
134
- "langs": ["uk", "be"],
135
- "weight": 1.7,
136
- "min_chars": 2_000,
137
- "latin": False,
138
- },
139
- # ~0.65% of CC — upweighted relative to CC share given speaker population
140
- "Arabic": {
141
- "langs": ["ar"],
142
- "weight": 1.4,
143
- "min_chars": 2_000,
144
- "latin": False,
145
- },
146
- "Norwegian": {
147
- "langs": ["no"],
148
- "weight": 1.0,
149
- "min_chars": 2_000,
150
- "latin": True,
151
- },
152
- # sv ~0.7%, dan ~0.51%, fin ~0.37%, isl ~0.04%, afr ~0.01%
153
- # combined ~2.0% of CC — was drastically overweighted at 6.0
154
- # note: Swedish Wikipedia is heavily bot-generated stubs, don't rely on article count
155
- "NordicCore": {
156
- "langs": ["sv", "da", "is", "af", "fi"],
157
- "weight": 2.1,
158
- "min_chars": 2_000,
159
- "latin": True,
160
- },
161
- # bul ~0.27%, srp ~0.25%, mkd ~0.037% of CC
162
- "BalkanCyrillic": {
163
- "langs": ["bg", "sr", "mk"],
164
- "weight": 1.05,
165
- "min_chars": 2_000,
166
- "latin": False,
167
- },
168
- # fas ~0.20% of CC (ignore the one anomalous crawl spike)
169
- "ArabicOther": {
170
- "langs": ["fa", "ps", "sd", "ug"],
171
- "weight": 0.95,
172
- "min_chars": 2_000,
173
- "latin": False,
174
- },
175
- # ~0.22% of CC — genuine web underrepresentation relative to speaker count,
176
- # but corpus is thin; 1.0 avoids oversampling a small pool
177
- "Hindi": {
178
- "langs": ["hi"],
179
- "weight": 1.0,
180
- "min_chars": 2_000,
181
- "latin": False,
182
- },
183
- # combined ~0.27% of CC — upweighted for script diversity
184
- "IndicOther": {
185
- "langs": [
186
- "ur",
187
- "bn",
188
- "ta",
189
- "te",
190
- "mr",
191
- "gu",
192
- "kn",
193
- "ml",
194
- "pa",
195
- "as",
196
- "or",
197
- "ne",
198
- ],
199
- "weight": 0.95,
200
- "min_chars": 2_000,
201
- "latin": False,
202
- },
203
- # kk ~0.038%, mn ~0.016% of CC — very thin corpus, weight is already a large relative boost
204
- "CentralAsianCaucusCyrillic": {
205
- "langs": ["kk", "mn", "tt", "ky", "tg", "ba", "ce"],
206
- "weight": 1.1,
207
- "min_chars": 2_000,
208
- "latin": False,
209
- },
210
- # Kurdish is split by script/source:
211
- # - ku: Wikipedia / Latin-script Kurdish
212
- # - ckb: FineTranslations / Arabic-script Kurdish
213
- "KurdishLatin": {
214
- "langs": ["ku"],
215
- "weight": 0.45,
216
- "min_chars": 1_500,
217
- "latin": True,
218
- },
219
- "KurdishArabic": {
220
- "langs": ["ckb"],
221
- "weight": 0.45,
222
- "min_chars": 2_000,
223
- "latin": False,
224
- },
225
- "AfricanLatin": {
226
- "langs": ["sw", "tl", "eu", "yo", "zu", "ny"],
227
- "weight": 1.0,
228
- "min_chars": 1_500,
229
- "latin": True,
230
- },
231
- "PeripheralLatin": {
232
- "langs": ["eo", "jv", "lb", "mg", "mt", "om", "rm", "so", "su", "uz"],
233
- "weight": 1.0,
234
- "min_chars": 1_500,
235
- "latin": True,
236
- },
237
- # Split the remaining non-Latin scripts into two buckets to keep
238
- # Greco-Semitic/Caucasus-style scripts separate from Brahmic/Tibetan ones.
239
- "OtherScriptsWest": {
240
- "langs": ["el", "he", "hy", "ka", "am", "ti", "dv", "hbo", "grc"],
241
- "weight": 1.0,
242
- "min_chars": 2_000,
243
- "latin": False,
244
- },
245
- "OtherScriptsEast": {
246
- "langs": ["km", "lo", "my", "th", "si", "bo"],
247
- "weight": 1.0,
248
- "min_chars": 2_000,
249
- "latin": False,
250
- },
251
- }