omarkamali committed
Commit 33e70cf · verified · 1 Parent(s): 6a2d0c9

Upload all models and assets for cdo (latest)

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +1 -0
  2. README.md +164 -129
  3. models/embeddings/aligned/cdo_128d.bin +3 -0
  4. models/embeddings/aligned/cdo_128d.meta.json +1 -0
  5. models/embeddings/aligned/cdo_128d.projection.npy +3 -0
  6. models/embeddings/aligned/cdo_128d_metadata.json +8 -0
  7. models/embeddings/aligned/cdo_32d.bin +3 -0
  8. models/embeddings/aligned/cdo_32d.meta.json +1 -0
  9. models/embeddings/aligned/cdo_32d.projection.npy +3 -0
  10. models/embeddings/aligned/cdo_32d_metadata.json +8 -0
  11. models/embeddings/aligned/cdo_64d.bin +3 -0
  12. models/embeddings/aligned/cdo_64d.meta.json +1 -0
  13. models/embeddings/aligned/cdo_64d.projection.npy +3 -0
  14. models/embeddings/aligned/cdo_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/cdo_128d.bin +2 -2
  16. models/embeddings/monolingual/cdo_128d_metadata.json +1 -1
  17. models/embeddings/monolingual/cdo_32d.bin +2 -2
  18. models/embeddings/monolingual/cdo_32d_metadata.json +1 -1
  19. models/embeddings/monolingual/cdo_64d.bin +2 -2
  20. models/embeddings/monolingual/cdo_64d_metadata.json +1 -1
  21. models/subword_markov/cdo_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/cdo_markov_ctx1_subword_metadata.json +2 -2
  23. models/subword_markov/cdo_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/cdo_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/cdo_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/cdo_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/cdo_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/cdo_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/cdo_2gram_subword.parquet +2 -2
  30. models/subword_ngram/cdo_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/cdo_3gram_subword.parquet +2 -2
  32. models/subword_ngram/cdo_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/cdo_4gram_subword.parquet +2 -2
  34. models/subword_ngram/cdo_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/cdo_5gram_subword.parquet +3 -0
  36. models/subword_ngram/cdo_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/cdo_tokenizer_32k.model +2 -2
  38. models/tokenizer/cdo_tokenizer_32k.vocab +0 -0
  39. models/tokenizer/cdo_tokenizer_64k.model +2 -2
  40. models/tokenizer/cdo_tokenizer_64k.vocab +0 -0
  41. models/vocabulary/cdo_vocabulary.parquet +2 -2
  42. models/vocabulary/cdo_vocabulary_metadata.json +9 -9
  43. models/word_markov/cdo_markov_ctx1_word.parquet +2 -2
  44. models/word_markov/cdo_markov_ctx1_word_metadata.json +2 -2
  45. models/word_markov/cdo_markov_ctx2_word.parquet +2 -2
  46. models/word_markov/cdo_markov_ctx2_word_metadata.json +2 -2
  47. models/word_markov/cdo_markov_ctx3_word.parquet +2 -2
  48. models/word_markov/cdo_markov_ctx3_word_metadata.json +2 -2
  49. models/word_markov/cdo_markov_ctx4_word.parquet +2 -2
  50. models/word_markov/cdo_markov_ctx4_word_metadata.json +2 -2
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 language: cdo
-language_name: CDO
 language_family: sinitic_other
 tags:
 - wikilangs
@@ -10,11 +10,21 @@ tags:
 - n-gram
 - markov
 - wikipedia
 - monolingual
 - family-sinitic_other
 license: mit
 library_name: wikilangs
-pipeline_tag: feature-extraction
 datasets:
 - omarkamali/wikipedia-monthly
 dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
 metrics:
 - name: best_compression_ratio
   type: compression
-  value: 2.892
 - name: best_isotropy
   type: isotropy
-  value: 0.5551
 - name: vocabulary_size
   type: vocab
   value: 0
 generated: 2026-01-03
 ---

-# CDO - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study

-This repository contains NLP models trained and evaluated by Wikilangs, specifically on **CDO** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.

 ## 📋 Repository Contents
@@ -60,7 +70,7 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
 - [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
@@ -80,39 +90,39 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and

 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **32k** | 2.757x | 2.76 | 0.1034% | 257,325 |
-| **64k** | 2.892x 🏆 | 2.90 | 0.1084% | 245,327 |

 ### Tokenization Examples

 Below are sample sentences tokenized with each vocabulary size:

-**Sample 1:** `BashkortostanNgò̤-lò̤-sṳ̆ gì siŏh ciáh gê̤ṳng-huò-guók. gì gê̤ṳng-huò-guók`

 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 32k | `▁b ash k ort os tan ngò ̤- ... (+17 more)` | 27 |
-| 64k | `▁bash k ortos tan ▁sê ▁ngò ̤- ̤- sṳ̆ ... (+15 more)` | 25 |

-**Sample 2:** `Montague Gông (Ĭng-ngṳ̄: Montague County) Mī-guók Texassiŏh ciáh gông. gì...`

 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 32k | `▁mont a guegông( ĭng - ngṳ̄ : ▁mont ... (+16 more)` | 26 |
-| 64k | `▁montaguegông( ĭng - ngṳ̄ : montaguecounty ) ... (+12 more)` | 22 |

-**Sample 3:** `Ochiltree Gông (Ĭng-ngṳ̄: Ochiltree County) sê Mī-guók Texas gì siŏh ciáh gông. ...`

 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 32k | `▁o chi l t re egông( ĭng - ... (+22 more)` | 32 |
-| 64k | `▁ochiltree ▁gông ▁( ĭng - ngṳ̄ : ▁ochiltree ▁county ) ... (+12 more)` | 22 |

 ### Key Findings

-- **Best Compression:** 64k achieves 2.892x compression
-- **Lowest UNK Rate:** 32k with 0.1034% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
@@ -129,12 +139,14 @@ Below are sample sentences tokenized with each vocabulary size:

 | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
 |--------|---------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | Word | 3,105 | 11.60 | 11,679 | 27.7% | 59.2% |
-| **2-gram** | Subword | 342 🏆 | 8.42 | 6,912 | 63.5% | 95.8% |
-| **3-gram** | Word | 4,698 | 12.20 | 17,954 | 23.8% | 52.2% |
-| **3-gram** | Subword | 1,659 | 10.70 | 21,000 | 36.1% | 75.8% |
-| **4-gram** | Word | 8,483 | 13.05 | 30,938 | 18.6% | 45.4% |
-| **4-gram** | Subword | 5,744 | 12.49 | 69,193 | 23.7% | 55.8% |

 ### Top 5 N-grams by Size
@@ -142,18 +154,18 @@ Below are sample sentences tokenized with each vocabulary size:

 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `gì siŏh` | 6,258 |
-| 2 | `siŏh ciáh` | 6,232 |
-| 3 | `mī guók` | 3,385 |
-| 4 | `sê mī` | 3,191 |
 | 5 | `gì gông` | 3,000 |

 **3-grams (Word):**

 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `gì siŏh ciáh` | 5,413 |
-| 2 | `sê mī guók` | 3,173 |
 | 3 | `siŏh ciáh gông` | 3,000 |
 | 4 | `ciáh gông gì` | 2,557 |
 | 5 | `gông gì gông` | 2,557 |
@@ -163,47 +175,67 @@ Below are sample sentences tokenized with each vocabulary size:

 | Rank | N-gram | Count |
 |------|--------|-------|
 | 1 | `gì siŏh ciáh gông` | 3,000 |
-| 2 | `ciáh gông gông` | 2,557 |
-| 3 | `siŏh ciáh gông gì` | 2,557 |
 | 4 | `county sê mī guók` | 1,971 |
 | 5 | `gông sê mī guók` | 1,029 |

 **2-grams (Subword):**

 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `n g` | 146,797 |
-| 2 | `_ g` | 59,970 |
-| 3 | `g -` | 55,946 |
-| 4 | `g _` | 55,139 |
-| 5 | `_ s` | 41,311 |

 **3-grams (Subword):**

 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `n g -` | 55,920 |
-| 2 | `n g _` | 55,025 |
-| 3 | `_ g ì` | 23,090 |
-| 4 | `g ì _` | 22,312 |
-| 5 | `_ s i` | 14,134 |

 **4-grams (Subword):**

 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `_ g ì _` | 22,161 |
-| 2 | `_ s ê _` | 13,231 |
-| 3 | `n g _ g` | 11,336 |
-| 4 | `i ŏ h _` | 10,632 |
-| 5 | `_ s i ŏ` | 9,391 |

 ### Key Findings

-- **Best Perplexity:** 2-gram (subword) with 342
 - **Entropy Trend:** Decreases with larger n-grams (more predictable)
-- **Coverage:** Top-1000 patterns cover ~56% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance

 ---
@@ -219,14 +251,14 @@ Below are sample sentences tokenized with each vocabulary size:

 | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
 |---------|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | Word | 0.4882 | 1.403 | 4.73 | 29,670 | 51.2% |
-| **1** | Subword | 0.3461 | 1.271 | 2.92 | 25,622 | 65.4% |
-| **2** | Word | 0.3187 | 1.247 | 1.80 | 139,308 | 68.1% |
-| **2** | Subword | 0.2753 | 1.210 | 1.79 | 74,780 | 72.5% |
-| **3** | Word | 0.1201 | 1.087 | 1.23 | 249,012 | 88.0% |
-| **3** | Subword | 0.2343 | 1.176 | 1.69 | 133,665 | 76.6% |
-| **4** | Word | 0.0526 🏆 | 1.037 | 1.09 | 301,670 | 94.7% |
-| **4** | Subword | 0.2290 | 1.172 | 1.53 | 225,577 | 77.1% |

 ### Generated Text Samples (Word-based)
@@ -234,27 +266,27 @@ Below are text samples generated from each word-based Markov chain model:

 **Context Size 1:**

-1. `gì hiŏng 蘆洋鄉 bìng dēng có̤i kăi sṳ̄ háng nè̤ng gó̤ lō̤ 法老 gái`
-2. `sê dṳ̆ng huà ìng chê 邢臺市 lòng 退之 ĕng diŏh adelaide ô 2 nguŏk`
-3. `siŏh bĭk cék éng sáuk īng lĭk â̤ dé̤ṳng buŏng nàng áng turkic ngṳ̄`

 **Context Size 2:**

-1. `gì siŏh cṳ̄ng lòi ĭng ôi sĭng ô 79 ciáh mŭk sṳ̆ 佳木斯 sê dṳ̆ng guók sĭng`
-2. `siŏh ciáh chê hăk kṳ̆ 市轄區 lĭk sṳ̄ diē sié siŏh cṳ̄ng ciŏng muòng dò̤ lā̤`
-3. `mī guók montana gì siŏh ciáh gônggông`

 **Context Size 3:**

-1. `gì siŏh ciáh duâi kṳ̆ duâi kṳ̆`
-2. `sê mī guók kansas siŏh ciáh mō̤ diŏh eta piăng âu gâe̤ng sigma sèng`
 3. `siŏh ciáh gông gì gông`

 **Context Size 4:**

 1. `gì siŏh ciáh gông gì gông`
 2. `siŏh ciáh gông gì gông`
-3. `county sê mī guók nebraska gì siŏh ciáh gông gì gông`

 ### Generated Text Samples (Subword-based)
@@ -263,34 +295,34 @@ Below are text samples generated from each subword-based Markov chain model:

 **Context Size 1:**

-1. `_ônià_hô̤_sênty)_`
-2. `gì_ciônièng,_sṳ̄:`
-3. `ngăngôner_g-ccáh`

 **Context Size 2:**

-1. `nguô_ka_cĭ_gì_dâ̤_`
-2. `_gì_sié-sê_hŏk-cê`
-3. `g-hĭ_(獨聯體)_gông_n`

 **Context Size 3:**

-1. `ng-hèng_biéng,_mac`
-2. `ng_adahoma_gì_(兩個聲`
-3. `_gì_siōng-dĕ̤ng-ŭk_`

 **Context Size 4:**

-1. `_gì_«sṳ̀ng-kṳ̆_dĕk-bi`
-2. `_sê_„發現更大的世界“)_có̤_c`
-3. `ng_găk_chăng-muò_(𧋘`

 ### Key Findings

 - **Best Predictability:** Context-4 (word) with 94.7% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (225,577 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
  ---
@@ -306,64 +338,64 @@ Below are text samples generated from each subword-based Markov chain model:

 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 9,559 |
-| Total Tokens | 467,385 |
-| Mean Frequency | 48.89 |
 | Median Frequency | 3 |
-| Frequency Std Dev | 395.71 |

 ### Most Common Words

 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | gì | 23,295 |
-| 2 | sê | 14,068 |
-| 3 | siŏh | 9,247 |
 | 4 | gông | 9,087 |
-| 5 | guók | 8,549 |
-| 6 | ciáh | 7,131 |
-| 7 | nièng | 5,854 |
-| 8 | ngṳ̄ | 5,277 |
-| 9 | sié | 4,616 |
-| 10 | gáu | 4,179 |

 ### Least Common Words (from vocabulary)

 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | woolridge | 2 |
-| 2 | imperiyası | 2 |
-| 3 | abş | 2 |
-| 4 | çox | 2 |
-| 5 | dünyada | 2 |
-| 6 | bütün | 2 |
-| 7 | 嘉祿 | 2 |
-| 8 | 六一路 | 2 |
-| 9 | 神壇樹 | 2 |
-| 10 | 신단수 | 2 |

 ### Zipf's Law Analysis

 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 1.3995 |
-| R² (Goodness of Fit) | 0.957431 |
 | Adherence Quality | **excellent** |

 ### Coverage Analysis

 | Top N Words | Coverage |
 |-------------|----------|
-| Top 100 | 52.2% |
-| Top 1,000 | 91.7% |
 | Top 5,000 | 98.0% |
 | Top 10,000 | 0.0% |

 ### Key Findings

-- **Zipf Compliance:** R²=0.9574 indicates excellent adherence to Zipf's law
-- **High Frequency Dominance:** Top 100 words cover 52.2% of corpus
-- **Long Tail:** -441 words needed for remaining 100.0% coverage

 ---
 ## 5. Word Embeddings Evaluation
@@ -379,37 +411,40 @@ Below are text samples generated from each subword-based Markov chain model:

 ### 5.1 Cross-Lingual Alignment

-> *Note: Multilingual alignment visualization not available for this language.*

 ### 5.2 Model Comparison

 | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
 |-------|-----------|----------|------------------|---------------|----------------|
-| **mono_32d** | 32 | 0.5551 🏆 | 0.4156 | N/A | N/A |
-| **mono_64d** | 64 | 0.1856 | 0.4055 | N/A | N/A |
-| **mono_128d** | 128 | 0.0279 | 0.4128 | N/A | N/A |

 ### Key Findings

-- **Best Isotropy:** mono_32d with 0.5551 (more uniform distribution)
-- **Semantic Density:** Average pairwise similarity of 0.4113. Lower values indicate better semantic separation.
-- **Alignment Quality:** No aligned models evaluated in this run.
 - **Recommendation:** 128d aligned for best cross-lingual performance

 ---
 ## 6. Morphological Analysis (Experimental)

-> ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
-
 This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.

 ### 6.1 Productivity & Complexity

 | Metric | Value | Interpretation | Recommendation |
 |--------|-------|----------------|----------------|
-| Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
-| Idiomaticity Gap | **-1.000** | Low formulaic content | - |

 ### 6.2 Affix Inventory (Productive Units)
@@ -424,10 +459,10 @@ Bound stems are high-frequency subword units that are semantically cohesive but

 | Stem | Cohesion | Substitutability | Examples |
 |------|----------|------------------|----------|
-| `áung` | 1.97x | 9 contexts | táung, láung, dáung |
-| `âung` | 1.96x | 9 contexts | câung, bâung, hâung |
-| `iăng` | 1.80x | 7 contexts | hiăng, siăng, giăng |
-| `iāng` | 1.55x | 8 contexts | liāng, biāng, ciāng |

 ### 6.4 Affix Compatibility (Co-occurrence)
@@ -446,7 +481,7 @@ Using **Recursive Hierarchical Substitutability**, we decompose complex words in

 ### 6.6 Linguistic Interpretation

 > **Automated Insight:**
-The language CDO appears to be more isolating or has a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating fewer productive morphological processes.

 ---
 ## 7. Summary & Recommendations
@@ -458,7 +493,7 @@ The language CDO appears to be more isolating or has a highly fixed vocabulary.

 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
 | Tokenizer | **64k BPE** | Best compression (2.89x) |
-| N-gram | **2-gram** | Lowest perplexity (342) |
 | Markov | **Context-4** | Highest predictability (94.7%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
@@ -673,4 +708,4 @@ MIT License - Free for academic and commercial use.
 ---
 *Generated by Wikilangs Models Pipeline*

-*Report Date: 2026-01-03 09:43:04*
 
 ---
 language: cdo
+language_name: Min Dong Chinese
 language_family: sinitic_other
 tags:
 - wikilangs

 - n-gram
 - markov
 - wikipedia
+- feature-extraction
+- sentence-similarity
+- tokenization
+- n-grams
+- markov-chain
+- text-mining
+- fasttext
+- babelvec
+- vocabulous
+- vocabulary
 - monolingual
 - family-sinitic_other
 license: mit
 library_name: wikilangs
+pipeline_tag: text-generation
 datasets:
 - omarkamali/wikipedia-monthly
 dataset_info:

 metrics:
 - name: best_compression_ratio
   type: compression
+  value: 2.891
 - name: best_isotropy
   type: isotropy
+  value: 0.5099
 - name: vocabulary_size
   type: vocab
   value: 0
 generated: 2026-01-03
 ---

+# Min Dong Chinese - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study

+This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Min Dong Chinese** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.

 ## 📋 Repository Contents

 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
+- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
 - [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)

 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
+| **32k** | 2.755x | 2.76 | 0.1043% | 256,064 |
+| **64k** | 2.891x 🏆 | 2.89 | 0.1094% | 244,079 |

 ### Tokenization Examples

 Below are sample sentences tokenized with each vocabulary size:

+**Sample 1:** `Jessamine Gông (Ĭng-ngṳ̄: Jessamine County) Mī-guók Kentucky gì siŏh ciáh gôn...`

 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 32k | `▁j ess am inegông( ĭng - ngṳ̄ : ... (+18 more)` | 28 |
+| 64k | `▁jessamine ▁gông ▁( ĭng - ngṳ̄ : ▁jessamine ▁county ) ... (+12 more)` | 22 |

+**Sample 2:** `2 nguŏk 1 hô̤nùng-lĭk 2 nguŏk dâ̤ 1 gĕ̤ng. 2 nguŏk`

 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 32k | `▁ 2 ▁nguŏk1 hô̤ ▁sê ▁nùng - lĭk ... (+12 more)` | 22 |
+| 64k | `▁ 2 nguŏk1 ▁hô̤nùng - lĭk ... (+12 more)` | 22 |

+**Sample 3:** `McLean Gông (Ĭng-ngṳ̄: McLean County) sê Mī-guók Kentucky gì siŏh ciáh gông. ...`

 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 32k | `▁mclean ▁gông ▁( ĭng - ngṳ̄ : mcleancounty ) ... (+12 more)` | 22 |
+| 64k | `▁mclean ▁gông ▁( ĭng - ngṳ̄ : ▁mclean ▁county ) ... (+12 more)` | 22 |

 ### Key Findings

+- **Best Compression:** 64k achieves 2.891x compression
+- **Lowest UNK Rate:** 32k with 0.1043% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
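The `▁` prefixes in the token tables suggest the `.model` files under `models/tokenizer/` are standard SentencePiece BPE models. A minimal sketch of reproducing the token counts above, under that assumption:

```python
import sentencepiece as spm

# Assumes the .model file is a standard SentencePiece model.
sp = spm.SentencePieceProcessor(model_file="models/tokenizer/cdo_tokenizer_32k.model")

text = "McLean Gông (Ĭng-ngṳ̄: McLean County) sê Mī-guók Kentucky gì siŏh ciáh gông."
pieces = sp.encode(text, out_type=str)
print(len(pieces), pieces[:6])

# Compression is plausibly characters per token: the table's 2.755x compression
# and 2.76 average token length nearly coincide, though the report does not
# spell out the exact definition.
print(len(text) / len(pieces))
```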
 
 
 | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
 |--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 3,139 | 11.62 | 11,777 | 27.5% | 59.0% |
+| **2-gram** | Subword | 341 🏆 | 8.41 | 6,920 | 63.6% | 95.8% |
+| **3-gram** | Word | 4,753 | 12.21 | 18,116 | 23.7% | 52.0% |
+| **3-gram** | Subword | 1,655 | 10.69 | 21,022 | 36.1% | 75.9% |
+| **4-gram** | Word | 8,558 | 13.06 | 31,134 | 18.5% | 45.2% |
+| **4-gram** | Subword | 5,737 | 12.49 | 69,190 | 23.7% | 55.8% |
+| **5-gram** | Word | 7,101 | 12.79 | 23,547 | 17.3% | 48.1% |
+| **5-gram** | Subword | 13,084 | 13.68 | 106,632 | 16.4% | 41.9% |

 ### Top 5 N-grams by Size

 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `gì siŏh` | 6,261 |
+| 2 | `siŏh ciáh` | 6,233 |
+| 3 | `mī guók` | 3,384 |
+| 4 | `sê mī` | 3,190 |
 | 5 | `gì gông` | 3,000 |

 **3-grams (Word):**

 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `gì siŏh ciáh` | 5,415 |
+| 2 | `sê mī guók` | 3,172 |
 | 3 | `siŏh ciáh gông` | 3,000 |
 | 4 | `ciáh gông gì` | 2,557 |
 | 5 | `gông gì gông` | 2,557 |

 | Rank | N-gram | Count |
 |------|--------|-------|
 | 1 | `gì siŏh ciáh gông` | 3,000 |
+| 2 | `siŏh ciáh gông gì` | 2,557 |
+| 3 | `ciáh gông gông` | 2,557 |
 | 4 | `county sê mī guók` | 1,971 |
 | 5 | `gông sê mī guók` | 1,029 |

+**5-grams (Word):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `siŏh ciáh gông gì gông` | 2,557 |
+| 2 | `gì siŏh ciáh gông gì` | 2,557 |
+| 3 | `diē sié gì siŏh ciáh` | 390 |
+| 4 | `ìng mìng gê̤ṳng huò guók` | 385 |
+| 5 | `dâi chók sié guó sié` | 348 |
+
 **2-grams (Subword):**

 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `n g` | 148,099 |
+| 2 | `_ g` | 60,261 |
+| 3 | `g -` | 56,437 |
+| 4 | `g _` | 55,736 |
+| 5 | `_ s` | 41,503 |

 **3-grams (Subword):**

 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `n g -` | 56,411 |
+| 2 | `n g _` | 55,623 |
+| 3 | `_ g ì` | 23,145 |
+| 4 | `g ì _` | 22,365 |
+| 5 | `_ s i` | 14,188 |

 **4-grams (Subword):**

 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `_ g ì _` | 22,216 |
+| 2 | `_ s ê _` | 13,258 |
+| 3 | `n g _ g` | 11,418 |
+| 4 | `i ŏ h _` | 10,678 |
+| 5 | `_ s i ŏ` | 9,423 |
+
+**5-grams (Subword):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ s i ŏ h` | 9,171 |
+| 2 | `_ g ô n g` | 9,066 |
+| 3 | `s i ŏ h _` | 8,474 |
+| 4 | `_ g ì _ s` | 8,113 |
+| 5 | `i ŏ h _ c` | 7,536 |

 ### Key Findings

+- **Best Perplexity:** 2-gram (subword) with 341
 - **Entropy Trend:** Decreases with larger n-grams (more predictable)
+- **Coverage:** Top-1000 patterns cover ~42% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
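The Perplexity column is consistent with 2^Entropy of the n-gram count distribution (e.g. 2^8.41 ≈ 340 against the table's 341 for the 2-gram subword model). A sketch of re-deriving these metrics, where the `ngram`/`count` column names are assumptions about the undocumented parquet schema:

```python
import numpy as np
import pandas as pd

# Column names are assumptions; inspect the parquet schema first.
df = pd.read_parquet("models/subword_ngram/cdo_2gram_subword.parquet")
p = df["count"] / df["count"].sum()

entropy = -(p * np.log2(p)).sum()   # entropy of the n-gram distribution, in bits
perplexity = 2 ** entropy           # matches the table's perplexity values
top1000 = p.nlargest(1000).sum()    # Top-1000 coverage

print(f"H={entropy:.2f} bits  PPL={perplexity:.0f}  top-1000={top1000:.1%}")
```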
 
 ---

 | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
 |---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 0.4885 | 1.403 | 4.74 | 29,717 | 51.2% |
+| **1** | Subword | 0.3463 | 1.271 | 2.92 | 25,650 | 65.4% |
+| **2** | Word | 0.3200 | 1.248 | 1.81 | 139,964 | 68.0% |
+| **2** | Subword | 0.2749 | 1.210 | 1.79 | 74,833 | 72.5% |
+| **3** | Word | 0.1204 | 1.087 | 1.23 | 250,754 | 88.0% |
+| **3** | Subword | 0.2342 | 1.176 | 1.69 | 133,597 | 76.6% |
+| **4** | Word | 0.0528 🏆 | 1.037 | 1.09 | 303,909 | 94.7% |
+| **4** | Subword | 0.2293 | 1.172 | 1.54 | 225,426 | 77.1% |

 ### Generated Text Samples (Word-based)

 **Context Size 1:**

+1. `gì siŏh déng bĭng giàng guó guók găk hók ciŭ siŏh gă`
+2. `sê guók dâi chók sirens nièng gáu huòng 閩江公園 hŏk â dā̤`
+3. `siŏh cṳ̄ng ī céng mò̤ siū ăng iók hâng săng guók`

 **Context Size 2:**

+1. `gì siŏh ciáh gông gông`
+2. `siŏh ciáh mìng cŭk giū cê̤ṳ sìng bŏng ĭ sá̤ bò̤ dìng uòng 陳垣`
+3. `mī guók tennessee gì siŏh cṳ̄ng â̤ buŏi sèng dău cê mō̤ gì dâ̤ 140 ôi`

 **Context Size 3:**

+1. `gì siŏh ciáh gáu puái céng tūng puái nêng dêng sê siŏh ciáh bìng nièng tàu gĕ̤ng sê`
+2. `sê mī guók dâ̤ 19 êng gáu huòng 310 nièng gáu 314 nièng câi ôi nièng hô̤ tái`
 3. `siŏh ciáh gông gì gông`

 **Context Size 4:**

 1. `gì siŏh ciáh gông gì gông`
 2. `siŏh ciáh gông gì gông`
+3. `county sê mī guók georgia gì siŏh ciáh gông gì gông`

 ### Generated Text Samples (Subword-based)

 **Context Size 1:**

+1. `_7_g_sê-ngì-gì_s`
+2. `g_cīng_(ĭngṳ̄_sēn`
+3. `nerotŭ_sê_g_sê-m`

 **Context Size 2:**

+1. `ngiù_hâiu-gáu-sī“`
+2. `_guô-hô̤_gāi_gôngu`
+3. `g-gă_dìng_coung-h`

 **Context Size 3:**

+1. `ng-huá-hŏk-pŭng-cŭ`
+2. `ng_siàng_gâe̤ng_(埃及`
+3. `_gì_pàng,_ĭ_mĕ̤k-ci`

 **Context Size 4:**

+1. `_gì_siŏh_ciáh_dĭng_`
+2. `_sê_mī-guók-nè̤ng_nè̤`
+3. `ng_gék-cĭu_gó_ô_sié`

 ### Key Findings

 - **Best Predictability:** Context-4 (word) with 94.7% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
+- **Memory Trade-off:** Larger contexts require more storage (225,426 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
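A minimal sketch of the kind of sampling that produces the word-based samples above, assuming (unverified) that each Markov parquet stores `context`, `next`, and `count` columns, with contexts keyed as space-joined strings:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

# Column names and context keying are assumptions about the schema.
df = pd.read_parquet("models/word_markov/cdo_markov_ctx2_word.parquet")

def generate(seed, steps=10):
    """Repeatedly draw the next word from the count-weighted
    transitions of the current context-size-2 window."""
    words = list(seed)
    for _ in range(steps):
        ctx = " ".join(words[-2:])
        rows = df[df["context"] == ctx]
        if rows.empty:
            break
        probs = (rows["count"] / rows["count"].sum()).to_numpy()
        words.append(rng.choice(rows["next"].to_numpy(), p=probs))
    return " ".join(words)

print(generate(["gì", "siŏh"]))  # seeded with the corpus's top 2-gram
```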
 
 ---

 | Metric | Value |
 |--------|-------|
+| Vocabulary Size | 9,566 |
+| Total Tokens | 470,049 |
+| Mean Frequency | 49.14 |
 | Median Frequency | 3 |
+| Frequency Std Dev | 396.77 |

 ### Most Common Words

 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | gì | 23,347 |
+| 2 | sê | 14,101 |
+| 3 | siŏh | 9,273 |
 | 4 | gông | 9,087 |
+| 5 | guók | 8,556 |
+| 6 | ciáh | 7,148 |
+| 7 | nièng | 5,899 |
+| 8 | ngṳ̄ | 5,273 |
+| 9 | sié | 4,623 |
+| 10 | gáu | 4,196 |

 ### Least Common Words (from vocabulary)

 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | 小天王國 | 2 |
+| 2 | baidu | 2 |
+| 3 | 宋在康 | 2 |
+| 4 | woolridge | 2 |
+| 5 | 六一路 | 2 |
+| 6 | 神壇樹 | 2 |
+| 7 | 신단수 | 2 |
+| 8 | | 2 |
+| 9 | kbo | 2 |
+| 10 | 우주항공청 | 2 |

 ### Zipf's Law Analysis

 | Metric | Value |
 |--------|-------|
+| Zipf Coefficient | 1.4007 |
+| R² (Goodness of Fit) | 0.957225 |
 | Adherence Quality | **excellent** |

 ### Coverage Analysis

 | Top N Words | Coverage |
 |-------------|----------|
+| Top 100 | 52.1% |
+| Top 1,000 | 91.8% |
 | Top 5,000 | 98.0% |
 | Top 10,000 | 0.0% |

 ### Key Findings

+- **Zipf Compliance:** R²=0.9572 indicates excellent adherence to Zipf's law
+- **High Frequency Dominance:** Top 100 words cover 52.1% of corpus
+- **Long Tail:** -434 words needed for remaining 100.0% coverage
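The Zipf coefficient and R² above are what a log-log least-squares fit of frequency against rank yields. A sketch, where the `frequency` column name is an assumption about the vocabulary parquet:

```python
import numpy as np
import pandas as pd

# "frequency" is an assumed column name; the actual schema may differ.
freq = (pd.read_parquet("models/vocabulary/cdo_vocabulary.parquet")["frequency"]
        .sort_values(ascending=False).to_numpy())

rank = np.arange(1, len(freq) + 1)
log_r, log_f = np.log(rank), np.log(freq)

slope, intercept = np.polyfit(log_r, log_f, 1)
print(f"Zipf coefficient ≈ {-slope:.4f}")   # report: 1.4007

# R² of the log-log fit
pred = slope * log_r + intercept
ss_res = ((log_f - pred) ** 2).sum()
ss_tot = ((log_f - log_f.mean()) ** 2).sum()
print(f"R² ≈ {1 - ss_res / ss_tot:.4f}")    # report: 0.9572
```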
 
 ---
 ## 5. Word Embeddings Evaluation

 ### 5.1 Cross-Lingual Alignment

+![Alignment Quality](visualizations/embedding_alignment_quality.png)
+
+![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)

 ### 5.2 Model Comparison

 | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
 |-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.5099 | 0.4122 | N/A | N/A |
+| **mono_64d** | 64 | 0.2128 | 0.3926 | N/A | N/A |
+| **mono_128d** | 128 | 0.0308 | 0.3921 | N/A | N/A |
+| **aligned_32d** | 32 | 0.5099 🏆 | 0.4223 | 0.0120 | 0.1260 |
+| **aligned_64d** | 64 | 0.2128 | 0.3730 | 0.0280 | 0.2380 |
+| **aligned_128d** | 128 | 0.0308 | 0.3804 | 0.0380 | 0.2160 |

 ### Key Findings

+- **Best Isotropy:** aligned_32d with 0.5099 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.3954. Lower values indicate better semantic separation.
+- **Alignment Quality:** Aligned models achieve up to 3.8% R@1 in cross-lingual retrieval.
 - **Recommendation:** 128d aligned for best cross-lingual performance
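The embedding `.bin` sizes match fastText binaries (the repo tags also mention fastText), so a sketch of loading one and computing rough analogues of the comparison columns could look like the following. The report's exact isotropy and density definitions are unspecified, so these are plausible proxies, not the pipeline's code:

```python
import fasttext
import numpy as np

# Assumption: the .bin files are fastText binaries.
model = fasttext.load_model("models/embeddings/monolingual/cdo_128d.bin")
X = np.stack([model.get_word_vector(w) for w in model.get_words()])

# Isotropy proxy: spread of principal-component variances of the
# centered embedding matrix (min/max ratio; 1.0 would be fully isotropic).
Xc = X - X.mean(axis=0)
var = np.linalg.svd(Xc, compute_uv=False) ** 2
print("isotropy proxy:", var.min() / var.max())

# Semantic density proxy: mean pairwise cosine similarity over a sample.
idx = np.random.default_rng(0).choice(len(Xc), 500, replace=False)
S = Xc[idx] / np.linalg.norm(Xc[idx], axis=1, keepdims=True)
sims = S @ S.T
print("avg pairwise cosine:", sims[np.triu_indices(500, k=1)].mean())
```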
 
 ---
 ## 6. Morphological Analysis (Experimental)

 This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.

 ### 6.1 Productivity & Complexity

 | Metric | Value | Interpretation | Recommendation |
 |--------|-------|----------------|----------------|
+| Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
+| Idiomaticity Gap | **0.147** | Low formulaic content | - |

 ### 6.2 Affix Inventory (Productive Units)

 | Stem | Cohesion | Substitutability | Examples |
 |------|----------|------------------|----------|
+| `áung` | 2.01x | 9 contexts | dáung, sáung, gáung |
+| `âung` | 1.99x | 9 contexts | hâung, dâung, lâung |
+| `iăng` | 1.88x | 7 contexts | siăng, hiăng, tiăng |
+| `iāng` | 1.54x | 8 contexts | niāng, biāng, tiāng |

 ### 6.4 Affix Compatibility (Co-occurrence)

 ### 6.6 Linguistic Interpretation

 > **Automated Insight:**
+The language Min Dong Chinese shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
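As an illustration of what the Substitutability column above counts, here is a toy reconstruction (hypothetical, not the pipeline's actual method): how many distinct one-character onsets a candidate stem appears with across the vocabulary.

```python
def substitutability(stem, vocab):
    """Count distinct single-character contexts in which a bound
    stem occurs (illustrative analogue of the report's column)."""
    onsets = {w[:-len(stem)] for w in vocab
              if w.endswith(stem) and 0 < len(w) - len(stem) <= 1}
    return len(onsets), sorted(onsets)

# Hypothetical mini-vocabulary built from the table's example forms.
vocab = ["dáung", "sáung", "gáung", "táung", "láung", "gông"]
print(substitutability("áung", vocab))  # (5, ['d', 'g', 'l', 's', 't'])
```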
 
 ---
 ## 7. Summary & Recommendations

 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
 | Tokenizer | **64k BPE** | Best compression (2.89x) |
+| N-gram | **2-gram** | Lowest perplexity (341) |
 | Markov | **Context-4** | Highest predictability (94.7%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |

 ---
 *Generated by Wikilangs Models Pipeline*

+*Report Date: 2026-01-03 20:07:11*
models/embeddings/aligned/cdo_128d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f0e088015b1d1c1d96463a5474f4a8111c8b8eeb2313e4d8bc4aca5e41fc56d
+size 1030144124
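Each `+3 -0` entry like the one above is a Git LFS pointer: only the `oid` and `size` metadata live in the git history, and the ~1 GB payload is fetched separately (e.g. with `git lfs pull`, or via `huggingface_hub` as sketched below, where the `repo_id` is a placeholder since the commit page does not show it):

```python
from huggingface_hub import hf_hub_download

# repo_id is hypothetical; substitute the actual repository path.
path = hf_hub_download(
    repo_id="omarkamali/wikilangs-cdo",
    filename="models/embeddings/aligned/cdo_128d.bin",
)
print(path)  # local path to the resolved 1,030,144,124-byte binary
```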
models/embeddings/aligned/cdo_128d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "cdo", "dim": 128, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/cdo_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37e55fe9fe01120afbbdfa1bba712d903bc5772106302e13b181df94ae3447e2
+size 65664
models/embeddings/aligned/cdo_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+"language": "cdo",
+"dimension": 128,
+"version": "aligned",
+"hub_language": "en",
+"seed_vocab_size": 1008,
+"vocab_size": 5890
+}
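The new `.projection.npy` files are sized like square float32 matrices (65,664 bytes ≈ a 128-byte npy header plus 128×128×4 for the 128d variant), which suggests a linear map into the shared English-hub space declared by `"hub_language": "en"`. A hedged sketch; the matrix orientation is a guess to be checked against the shapes:

```python
import numpy as np

W = np.load("models/embeddings/aligned/cdo_128d.projection.npy")
print(W.shape)  # expected (128, 128) given the 65,664-byte file size

v = np.random.randn(128).astype(np.float32)  # stand-in for a word vector
aligned = v @ W  # whether it is v @ W or W @ v is an assumption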
models/embeddings/aligned/cdo_32d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86a7dce9574a1b349e8220d6f822717130a1b46d2b0974b780a9d477774160c8
+size 257620604
models/embeddings/aligned/cdo_32d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "cdo", "dim": 32, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/cdo_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74adfe83fc67339561073ef092411ca4c2419545d4da63d7e3027e04aebe1f1a
+size 4224
models/embeddings/aligned/cdo_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+"language": "cdo",
+"dimension": 32,
+"version": "aligned",
+"hub_language": "en",
+"seed_vocab_size": 1008,
+"vocab_size": 5890
+}
models/embeddings/aligned/cdo_64d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff1b4674bfe38edab895e88af4342b16a3f9789a7c958abf6e47054845cd6aa4
+size 515128444
models/embeddings/aligned/cdo_64d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "cdo", "dim": 64, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/cdo_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22c599ba6e1acd33d7d4166b18a164e7368ae4a2067a5d4caa57f682863aa44c
+size 16512
models/embeddings/aligned/cdo_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+"language": "cdo",
+"dimension": 64,
+"version": "aligned",
+"hub_language": "en",
+"seed_vocab_size": 1008,
+"vocab_size": 5890
+}
models/embeddings/monolingual/cdo_128d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8c366500bae149b287db9cc735aecf709e56990e0f1dd949aa01f743c5b542bf
-size 1030106435
+oid sha256:5f0e088015b1d1c1d96463a5474f4a8111c8b8eeb2313e4d8bc4aca5e41fc56d
+size 1030144124
models/embeddings/monolingual/cdo_128d_metadata.json CHANGED
@@ -11,5 +11,5 @@
 "encoding_method": "rope",
 "dim": 128
 },
-"vocab_size": 5854
+"vocab_size": 5890
 }
models/embeddings/monolingual/cdo_32d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9ee34c3106d7321309452b983807ebe84c3a153af734404b31b03ac0f6076d5c
-size 257610563
+oid sha256:86a7dce9574a1b349e8220d6f822717130a1b46d2b0974b780a9d477774160c8
+size 257620604
models/embeddings/monolingual/cdo_32d_metadata.json CHANGED
@@ -11,5 +11,5 @@
 "encoding_method": "rope",
 "dim": 32
 },
-"vocab_size": 5854
+"vocab_size": 5890
 }
models/embeddings/monolingual/cdo_64d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aad5fc0dca0e268941025dbedc1373061213d3a2602f82f5fb36a6a8ac4c5600
-size 515109187
+oid sha256:ff1b4674bfe38edab895e88af4342b16a3f9789a7c958abf6e47054845cd6aa4
+size 515128444
models/embeddings/monolingual/cdo_64d_metadata.json CHANGED
@@ -11,5 +11,5 @@
 "encoding_method": "rope",
 "dim": 64
 },
-"vocab_size": 5854
+"vocab_size": 5890
 }
models/subword_markov/cdo_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:41c23bda939ee89e9ea22c7d220f77ade97cf29b5706a8ccea6a5a1649c35d5c
-size 588804
+oid sha256:3da6052e82e4c697f086d6afaa06670505468ff51dcf1a7340b522bf070015e6
+size 592109
models/subword_markov/cdo_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
 "context_size": 1,
 "variant": "subword",
 "language": "cdo",
-"unique_contexts": 25622,
-"total_transitions": 2209713
+"unique_contexts": 25650,
+"total_transitions": 2221841
 }
models/subword_markov/cdo_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:635b5359d40f1fb9f72a9835efb0cf9b06c7ece40db0ae41691aa56bcf42a68f
-size 1373275
+oid sha256:3bb9ad24e188fc52306a7642dd496ab7d8532ccf33c025d3f7d1c4af1d6ff510
+size 1367425
models/subword_markov/cdo_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
 "context_size": 2,
 "variant": "subword",
 "language": "cdo",
-"unique_contexts": 74780,
-"total_transitions": 2199286
+"unique_contexts": 74833,
+"total_transitions": 2211413
 }
models/subword_markov/cdo_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:edbc276446953ce254374098ddc144b562d74f64b6dc6b037712625c3474bcc7
-size 2410456
+oid sha256:0886f92da1dc62701890944aa0b1e24c9dcf37f74793e507af30e30779e26484
+size 2419564
models/subword_markov/cdo_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
 "context_size": 3,
 "variant": "subword",
 "language": "cdo",
-"unique_contexts": 133665,
-"total_transitions": 2188859
+"unique_contexts": 133597,
+"total_transitions": 2200985
 }
models/subword_markov/cdo_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:73b83a95594a9de41178ee150977a694d8fe0fc9528952db1f4037278640135b
-size 3940722
+oid sha256:f074b6b52183d155642e3893513a5b825b81d8e1f3470db88e8b5a1d13cc8a28
+size 3936769
models/subword_markov/cdo_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
 "context_size": 4,
 "variant": "subword",
 "language": "cdo",
-"unique_contexts": 225577,
-"total_transitions": 2178432
+"unique_contexts": 225426,
+"total_transitions": 2190557
 }
models/subword_ngram/cdo_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:07438dbbdf7df68795d2567c55c6704ba9e2aa37a1e974f3796653e4ee46d715
-size 93187
+oid sha256:cd4b2592803a4c0a58278e42cfcc15e90b5e81feb82f8a0d64046e6d145b1ab8
+size 93344
models/subword_ngram/cdo_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
 "n": 2,
 "variant": "subword",
 "language": "cdo",
-"unique_ngrams": 6912,
-"total_ngrams": 2209713
+"unique_ngrams": 6920,
+"total_ngrams": 2221841
 }
models/subword_ngram/cdo_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9bc12fdb56f63acfec879929e218e1b08b69ed6bc6caaf23de5405818299a15a
-size 290607
+oid sha256:f80cd1c950fefb1ab46a416d8d95323ac4dc6c0e6c4912368a62bf620aa23f96
+size 290909
models/subword_ngram/cdo_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
 "n": 3,
 "variant": "subword",
 "language": "cdo",
-"unique_ngrams": 21000,
-"total_ngrams": 2199286
+"unique_ngrams": 21022,
+"total_ngrams": 2211413
 }
models/subword_ngram/cdo_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d7bbf6e008db71c8cab5f01f7b94dece221cc12e30b6c0bbde60267e28ad3d1b
-size 893634
+oid sha256:df22974458699648ecdec9e69b43207c159d18dee62b39b3567352430145f9ac
+size 904230
models/subword_ngram/cdo_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
 "n": 4,
 "variant": "subword",
 "language": "cdo",
-"unique_ngrams": 69193,
-"total_ngrams": 2188859
+"unique_ngrams": 69190,
+"total_ngrams": 2200985
 }
models/subword_ngram/cdo_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa71ec0b3930bcdbc6ad1d120e08effe3a41602937f54280ae259ede5e07dd4a
+size 1399844
models/subword_ngram/cdo_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
+{
+"n": 5,
+"variant": "subword",
+"language": "cdo",
+"unique_ngrams": 106632,
+"total_ngrams": 2190557
+}
models/tokenizer/cdo_tokenizer_32k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e4ea04b7b5ac9eb6c0c47f8743b9ec573da4a66b11329b62767916c896096283
-size 659402
+oid sha256:1edb0f78c51e82af1aa3bd0048f32afb41dc8b1283e02f6ddf91540c4862694d
+size 659360
models/tokenizer/cdo_tokenizer_32k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/cdo_tokenizer_64k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:543fc6291e9f68ca726ae3c62da2bf8a9f2e8964f61ac8c8f47e7620b30680d4
-size 1252522
+oid sha256:67f7d92ddf85241d2e9badbee50f3b13e912420e37030ac222e16cc3f0c3a97e
+size 1253153
models/tokenizer/cdo_tokenizer_64k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/vocabulary/cdo_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:13d428e7f371fae4d1817f7fd0cad404bb08c4eebfeba6b19d2159ec0858966d
-size 161421
+oid sha256:372bfa68c5bd35d0af5617452462cbedcbf3c7aacdf2835c65228f95b19add52
+size 162642
models/vocabulary/cdo_vocabulary_metadata.json CHANGED
@@ -1,17 +1,17 @@
 {
 "language": "cdo",
-"vocabulary_size": 9559,
+"vocabulary_size": 9566,
 "variant": "full",
 "statistics": {
-"type_token_ratio": 0.06153581253100364,
+"type_token_ratio": 0.06128859968851162,
 "coverage": {
-"top_100": 0.4998278145152364,
-"top_1000": 0.8788593121599849,
-"top_5000": 0.9388721030817102,
-"top_10000": 0.9589624594646672
+"top_100": 0.49938436197884817,
+"top_1000": 0.8793349478542365,
+"top_5000": 0.9390661056614236,
+"top_10000": 0.9590967652502915
 },
-"hapax_count": 20461,
-"hapax_ratio": 0.6815789473684211,
-"total_documents": 10427
+"hapax_count": 20499,
+"hapax_ratio": 0.6818227174455347,
+"total_documents": 10428
 }
 }
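The updated statistics are internally consistent: the new `hapax_ratio` equals `hapax_count / (hapax_count + vocabulary_size)` exactly, which suggests hapaxes are tracked outside the stored 9,566-entry vocabulary:

```python
hapax_count = 20_499
vocabulary_size = 9_566

# 20499 / (20499 + 9566) = 0.6818227174455347, matching the metadata exactly.
print(hapax_count / (hapax_count + vocabulary_size))
```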
models/word_markov/cdo_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b7c52cf701393a62ef55eb76e071bcc146cb575dcba8d7de8f629c6f59a0ffa9
-size 1378282
+oid sha256:a594cf868f038a4349e56316d6ace7758633c37de5bff30cb3e48c189510cd01
+size 1374994
models/word_markov/cdo_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
 "context_size": 1,
 "variant": "word",
 "language": "cdo",
-"unique_contexts": 29670,
-"total_transitions": 477419
+"unique_contexts": 29717,
+"total_transitions": 480120
 }
models/word_markov/cdo_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4e2d34224b73bb7d088188c6354a0a9b2ed3b721bfecbe369bd09f27bc4bf8eb
-size 3117924
+oid sha256:6174989127c033bc7a8052470160130dcaaa4c7286e94049aa162170de1a6bd4
+size 3125934
models/word_markov/cdo_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
 "context_size": 2,
 "variant": "word",
 "language": "cdo",
-"unique_contexts": 139308,
-"total_transitions": 466992
+"unique_contexts": 139964,
+"total_transitions": 469692
 }
models/word_markov/cdo_markov_ctx3_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:751ecd29db06623ef008eb11136a6de55e6177de5d5106326042cf155bf67096
-size 4876570
+oid sha256:71ce6d10ada60379f7a9a20207ebf75b18557cd4f0f56e6350995d6c9b418296
+size 4909036
models/word_markov/cdo_markov_ctx3_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
 "context_size": 3,
 "variant": "word",
 "language": "cdo",
-"unique_contexts": 249012,
-"total_transitions": 456565
+"unique_contexts": 250754,
+"total_transitions": 459264
 }
models/word_markov/cdo_markov_ctx4_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:be6fc8ceb5ad611ae4227c2c3e9eb01458860d29280986707129fb49af2ed1fc
-size 5983737
+oid sha256:ca89a77dda70cd4580eaecc3797e7cb056289301d3f778429f14c4af0137ae4b
+size 6024496
models/word_markov/cdo_markov_ctx4_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
 "context_size": 4,
 "variant": "word",
 "language": "cdo",
-"unique_contexts": 301670,
-"total_transitions": 446138
+"unique_contexts": 303909,
+"total_transitions": 448836
 }