omarkamali committed on Jan 3

Commit

3d0ac29

verified ·

1 Parent(s): 8164949

Upload all models and assets for am (latest)

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
README.md +185 -148
models/embeddings/aligned/am_128d.bin +3 -0
models/embeddings/aligned/am_128d.meta.json +1 -0
models/embeddings/aligned/am_128d.projection.npy +3 -0
models/embeddings/aligned/am_128d_metadata.json +8 -0
models/embeddings/aligned/am_32d.bin +3 -0
models/embeddings/aligned/am_32d.meta.json +1 -0
models/embeddings/aligned/am_32d.projection.npy +3 -0
models/embeddings/aligned/am_32d_metadata.json +8 -0
models/embeddings/aligned/am_64d.bin +3 -0
models/embeddings/aligned/am_64d.meta.json +1 -0
models/embeddings/aligned/am_64d.projection.npy +3 -0
models/embeddings/aligned/am_64d_metadata.json +8 -0
models/embeddings/monolingual/am_128d.bin +2 -2
models/embeddings/monolingual/am_128d_metadata.json +1 -1
models/embeddings/monolingual/am_32d.bin +2 -2
models/embeddings/monolingual/am_32d_metadata.json +1 -1
models/embeddings/monolingual/am_64d.bin +2 -2
models/embeddings/monolingual/am_64d_metadata.json +1 -1
models/subword_markov/am_markov_ctx1_subword.parquet +2 -2
models/subword_markov/am_markov_ctx1_subword_metadata.json +2 -2
models/subword_markov/am_markov_ctx2_subword.parquet +2 -2
models/subword_markov/am_markov_ctx2_subword_metadata.json +2 -2
models/subword_markov/am_markov_ctx3_subword.parquet +2 -2
models/subword_markov/am_markov_ctx3_subword_metadata.json +2 -2
models/subword_markov/am_markov_ctx4_subword.parquet +2 -2
models/subword_markov/am_markov_ctx4_subword_metadata.json +2 -2
models/subword_ngram/am_2gram_subword.parquet +2 -2
models/subword_ngram/am_2gram_subword_metadata.json +2 -2
models/subword_ngram/am_3gram_subword.parquet +2 -2
models/subword_ngram/am_3gram_subword_metadata.json +2 -2
models/subword_ngram/am_4gram_subword.parquet +2 -2
models/subword_ngram/am_4gram_subword_metadata.json +2 -2
models/subword_ngram/am_5gram_subword.parquet +3 -0
models/subword_ngram/am_5gram_subword_metadata.json +7 -0
models/tokenizer/am_tokenizer_16k.model +2 -2
models/tokenizer/am_tokenizer_16k.vocab +0 -0
models/tokenizer/am_tokenizer_32k.model +2 -2
models/tokenizer/am_tokenizer_32k.vocab +0 -0
models/tokenizer/am_tokenizer_64k.model +2 -2
models/tokenizer/am_tokenizer_64k.vocab +0 -0
models/tokenizer/am_tokenizer_8k.model +2 -2
models/tokenizer/am_tokenizer_8k.vocab +0 -0
models/vocabulary/am_vocabulary.parquet +2 -2
models/vocabulary/am_vocabulary_metadata.json +8 -8
models/word_markov/am_markov_ctx1_word.parquet +2 -2
models/word_markov/am_markov_ctx1_word_metadata.json +2 -2
models/word_markov/am_markov_ctx2_word.parquet +2 -2
models/word_markov/am_markov_ctx2_word_metadata.json +2 -2

.gitattributes CHANGED Viewed

@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text

 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 language: am
-language_name: AM
 language_family: semitic_ethiopic
 tags:
   - wikilangs
@@ -10,11 +10,21 @@ tags:
   - n-gram
   - markov
   - wikipedia
   - monolingual
   - family-semitic_ethiopic
 license: mit
 library_name: wikilangs
-pipeline_tag: feature-extraction
 datasets:
   - omarkamali/wikipedia-monthly
 dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
 metrics:
   - name: best_compression_ratio
     type: compression
-    value: 3.287
   - name: best_isotropy
     type: isotropy
-    value: 0.9163
   - name: vocabulary_size
     type: vocab
     value: 0
 generated: 2026-01-03
 ---
-# AM - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study
-This repository contains NLP models trained and evaluated by Wikilangs, specifically on **AM** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
 ## 📋 Repository Contents
@@ -60,7 +70,7 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
 - [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
@@ -80,47 +90,47 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 2.436x | 2.44 | 0.1557% | 683,952 |
-| **16k** | 2.745x | 2.75 | 0.1754% | 607,060 |
-| **32k** | 3.031x | 3.03 | 0.1937% | 549,802 |
-| **64k** | 3.287x 🏆 | 3.29 | 0.2101% | 506,938 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
-**Sample 1:** `አዋሳ ከነማ ስታዲየም በአዋሳ፣ ኢትዮጵያ የሚገኝ ስታዲዮም ነው። ፳፭ ሺህ ሰዎችን መያዝ ሲችል የአዋሳ ከተማ የእግር ኳስ ክለብ...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁አዋ ሳ ▁ከነ ማ ▁ስታዲየም ▁በአ ዋ ሳ፣ ▁ኢትዮጵያ ▁የሚገኝ ... (+25 more)` | 35 |
-| 16k | `▁አዋሳ ▁ከነ ማ ▁ስታዲየም ▁በአ ዋ ሳ፣ ▁ኢትዮጵያ ▁የሚገኝ ▁ስታ ... (+22 more)` | 32 |
-| 32k | `▁አዋሳ ▁ከነማ ▁ስታዲየም ▁በአ ዋ ሳ፣ ▁ኢትዮጵያ ▁የሚገኝ ▁ስታ ዲዮ ... (+20 more)` | 30 |
-| 64k | `▁አዋሳ ▁ከነማ ▁ስታዲየም ▁በአዋ ሳ፣ ▁ኢትዮጵያ ▁የሚገኝ ▁ስታ ዲዮ ም ... (+19 more)` | 29 |
-**Sample 2:** `የዝንጀሮ ስብሰባ በውሻ ጩኸት ይበተናል የአማርኛ ምሳሌ ነው። የዝንጀሮ ስብሰባ በውሻ ጩኸት ይበተናል የአማርኛ ምሳሌ ነው። ትር...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁የዝ ንጀሮ ▁ስብሰባ ▁በው ሻ ▁ ጩ ኸ ት ▁ይበ ... (+29 more)` | 39 |
-| 16k | `▁የዝ ንጀሮ ▁ስብሰባ ▁በው ሻ ▁ጩ ኸት ▁ይበ ተ ናል ... (+25 more)` | 35 |
-| 32k | `▁የዝንጀሮ ▁ስብሰባ ▁በው ሻ ▁ጩ ኸት ▁ይበ ተናል ▁የአማርኛ ▁ምሳሌ ... (+21 more)` | 31 |
-| 64k | `▁የዝንጀሮ ▁ስብሰባ ▁በውሻ ▁ጩኸት ▁ይበ ተናል ▁የአማርኛ ▁ምሳሌ ▁ነው። ▁የዝንጀሮ ... (+17 more)` | 27 |
-**Sample 3:** `የሐረሪ ብሔራዊ ሊግ የኢትዮጵያ ፖለቲካ ፓርቲ ነው። ዓላማ ሊቀመንበር ታሪክ መደብ: በምርጫ የተሳተፉ የኢትዮጵያ ፓርቲዎች መደብ...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁የሐ ረ ሪ ▁ብሔራዊ ▁ሊግ ▁የኢትዮጵያ ▁ፖለቲካ ▁ፓርቲ ▁ነው። ▁ዓላማ ... (+13 more)` | 23 |
-| 16k | `▁የሐ ረሪ ▁ብሔራዊ ▁ሊግ ▁የኢትዮጵያ ▁ፖለቲካ ▁ፓርቲ ▁ነው። ▁ዓላማ ▁ሊቀመንበር ... (+12 more)` | 22 |
-| 32k | `▁የሐረሪ ▁ብሔራዊ ▁ሊግ ▁የኢትዮጵያ ▁ፖለቲካ ▁ፓርቲ ▁ነው። ▁ዓላማ ▁ሊቀመንበር ▁ታሪክ ... (+11 more)` | 21 |
-| 64k | `▁የሐረሪ ▁ብሔራዊ ▁ሊግ ▁የኢትዮጵያ ▁ፖለቲካ ▁ፓርቲ ▁ነው። ▁ዓላማ ▁ሊቀመንበር ▁ታሪክ ... (+11 more)` | 21 |
 ### Key Findings
-- **Best Compression:** 64k achieves 3.287x compression
-- **Lowest UNK Rate:** 8k with 0.1557% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
@@ -137,12 +147,14 @@ Below are sample sentences tokenized with each vocabulary size:
 | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
 |--------|---------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | Word | 8,988 | 13.13 | 27,901 | 19.7% | 39.7% |
-| **2-gram** | Subword | 2,079 🏆 | 11.02 | 23,804 | 34.0% | 69.2% |
-| **3-gram** | Word | 9,944 | 13.28 | 35,714 | 22.1% | 40.5% |
-| **3-gram** | Subword | 19,139 | 14.22 | 153,027 | 11.8% | 35.5% |
-| **4-gram** | Word | 36,744 | 15.17 | 90,792 | 13.9% | 25.8% |
-| **4-gram** | Subword | 94,777 | 16.53 | 549,996 | 6.6% | 19.5% |
 ### Top 5 N-grams by Size
@@ -150,68 +162,88 @@ Below are sample sentences tokenized with each vocabulary size:
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `ዓ ም` | 8,324 |
-| 2 | `ምሳሌ ነው` | 5,625 |
-| 3 | `የአማርኛ ምሳሌ` | 5,563 |
-| 4 | `እ ኤ` | 4,026 |
-| 5 | `ኤ አ` | 3,961 |
 **3-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `የአማርኛ ምሳሌ ነው` | 5,563 |
-| 2 | `እ ኤ አ` | 3,908 |
 | 3 | `ምሳሌ ነው ትርጉሙ` | 3,454 |
 | 4 | `መደብ ተረትና ምሳሌ` | 3,051 |
-| 5 | `ነው ትርጉሙ መደብ` | 2,533 |
 **4-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
 | 1 | `የአማርኛ ምሳሌ ነው ትርጉሙ` | 3,452 |
-| 2 | `ምሳሌ ነው ትርጉሙ መደብ` | 2,533 |
-| 3 | `ትርጉሙ መደብ ያልተተረጎመ ምሳሌ` | 2,118 |
-| 4 | `ነው ትርጉሙ መደብ ያልተተረጎመ` | 2,114 |
 | 5 | `ምሳሌ መደብ ተረትና ምሳሌ` | 1,854 |
 **2-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `_ የ` | 170,716 |
-| 2 | `ት _` | 145,051 |
-| 3 | `_ በ` | 140,839 |
-| 4 | `ን _` | 132,909 |
-| 5 | `_ አ` | 113,769 |
 **3-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `_ እ ን` | 32,319 |
-| 2 | `_ ነ ው` | 26,511 |
-| 3 | `ው ። _` | 24,155 |
-| 4 | `_ እ ና` | 23,843 |
-| 5 | `እ ና _` | 22,397 |
 **4-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `_ እ ና _` | 22,267 |
-| 2 | `_ ነ ው ።` | 19,378 |
-| 3 | `ነ ው ። _` | 18,922 |
-| 4 | `_ እ ን ደ` | 13,836 |
-| 5 | `_ ላ ይ _` | 12,924 |
 ### Key Findings
-- **Best Perplexity:** 2-gram (subword) with 2,079
 - **Entropy Trend:** Decreases with larger n-grams (more predictable)
-- **Coverage:** Top-1000 patterns cover ~19% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
@@ -227,14 +259,14 @@ Below are sample sentences tokenized with each vocabulary size:
 | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
 |---------|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | Word | 0.7502 | 1.682 | 4.80 | 236,353 | 25.0% |
-| **1** | Subword | 1.2235 | 2.335 | 17.52 | 2,854 | 0.0% |
-| **2** | Word | 0.1468 | 1.107 | 1.28 | 1,130,961 | 85.3% |
-| **2** | Subword | 1.0397 | 2.056 | 6.98 | 49,981 | 0.0% |
-| **3** | Word | 0.0355 | 1.025 | 1.06 | 1,446,616 | 96.4% |
-| **3** | Subword | 0.6354 | 1.553 | 3.36 | 348,535 | 36.5% |
-| **4** | Word | 0.0159 🏆 | 1.011 | 1.02 | 1,520,994 | 98.4% |
-| **4** | Subword | 0.4515 | 1.367 | 2.14 | 1,171,344 | 54.9% |
 ### Generated Text Samples (Word-based)
@@ -242,27 +274,27 @@ Below are text samples generated from each word-based Markov chain model:
 **Context Size 1:**
-1. `ነው በጥሩ አስተዳደር የህዝብ ልውውጥ ኮሚሽን ሕንፃ በሥነ ሕንጻ ተጠናቆ ክፍት አረግንጓዴ ከጂዮርጂያ በእ አ የሆነ`
-2. `እና ሲሞት ቤተሰቦቹ ጋር ቢኮብለል ወይም ቀበሮ ዝርያ ያለበት ቦታ 4 5 14 847 ቅጥር የተለጠፈ`
-3. `ላይ መስኮት እና ከደቡብ ህንድ ቀጥሎ ዓ ም ቀዳማዊ ኃይለ ሥላሴ ለአፍሪካ ገሞጂማ ሣርማ ቦታዎች ይቀመጡ`
 **Context Size 2:**
-1. `ዓ ም አስቀድሞ ወይም ዓ ም የአርጡስ ወንድም 4 ካርል 663 669 ዓ ም ሁላቸው ሲስማሙ ወደ`
-2. `ምሳሌ ነው ጀርባዬን እከክልኝ ለኔ ራቀኝ የአማርኛ ምሳሌ ነው ዝርክርክ ከወንፊት የባሰ ዝክዝክ የአማርኛ ምሳሌ ነው ትርጉሙ`
-3. `የአማርኛ ምሳሌ ነው የምትጠላው ሰው ፈሱ እሆዱ ውስጥ ሳለ ኔሽን ኦፍ ኢስላም ጋር ያለው ዝምድና ግልጽ ነው`
 **Context Size 3:**
-1. `የአማርኛ ምሳሌ ነው ትርጉሙ መደብ ያልተተረጎመ ምሳሌ መደብ ተረትና ምሳሌ ቁና ሰፋች`
-2. `እ ኤ አ በ በሂትለር ተጽዕኖ ሙሶሎኒ በጣሊያን ፀረ ሴማዊ የዘር ህጎች እንዲፀድቁ ደገፈ በመጋቢት ጀርመን ቼኮዝሎቫኪያን ከቀላቀለች`
-3. `ምሳሌ ነው ትርጉሙ መደብ ያልተተረጎመ ምሳሌ መደብ ተረትና ምሳሌ ምግባር ሳይኖር ስም እንደማለት ነዉ`
 **Context Size 4:**
-1. `የአማርኛ ምሳሌ ነው ትርጉሙ ሁለቱም አያዋጡም መደብ ተረትና ምሳሌ`
-2. `ምሳሌ ነው ትርጉሙ መደብ ያልተተረጎመ ምሳሌ መደብ ተረትና ምሳሌ መደብ ፈሊጣዊ አነጋገር መደብ ተረትና ምሳሌ ቁና ሰፋች`
-3. `ነው ትርጉሙ መደብ ያልተተረጎመ ምሳሌ መደብ ተረትና ምሳሌ ሴት ሁሉን ቻይ ናት`
 ### Generated Text Samples (Subword-based)
@@ -271,34 +303,34 @@ Below are text samples generated from each subword-based Markov chain model:
 **Context Size 1:**
-1. `_አንግብ፣_ለማር_(“ሀንን`
-2. `ን_(dicole_ገደቡድ_ሞ`
-3. `ት_ቅ_ሓምበላስድ_ጋ_ይለት`
 **Context Size 2:**
-1. `_የኢትዮጵያ_አፖሎኛ_,00_`
-2. `ት_ተመለሰብን_ኣሉ።_የባህር`
-3. `_በዘመዴ_ሲፀድቅ_በተአምስተ`
 **Context Size 3:**
-1. `_እንደ_ማርኮ_ከተማ_እና_ግሮ`
-2. `_ነው።_ከተማው_ባልሞራል_እን`
-3. `ው።_ኮምፕዩተራይዝ_ካሊፎርኒያ`
 **Context Size 4:**
-1. `_እና_ማከማቸት_ጉዳት_ቁጭ_ብለ`
-2. `_ነው።_ዋጋው_ወቅት_የመጀመሪያ`
-3. `ነው።_ትርጉሙ_አንቴና_ይፈሳሉ፡`
 ### Key Findings
 - **Best Predictability:** Context-4 (word) with 98.4% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (1,171,344 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
@@ -314,48 +346,48 @@ Below are text samples generated from each subword-based Markov chain model:
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 99,716 |
-| Total Tokens | 1,636,892 |
-| Mean Frequency | 16.42 |
 | Median Frequency | 3 |
-| Frequency Std Dev | 174.41 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | ነው | 26,460 |
-| 2 | እና | 22,392 |
-| 3 | ላይ | 13,250 |
-| 4 | ምሳሌ | 11,607 |
-| 5 | ውስጥ | 9,622 |
-| 6 | ነበር | 9,005 |
-| 7 | ዓ | 8,679 |
-| 8 | ም | 8,584 |
-| 9 | ወደ | 8,446 |
-| 10 | እንደ | 6,776 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | ቫሊን | 2 |
-| 2 | ግሎቡላር | 2 |
-| 3 | ኢንዛይሞች | 2 |
-| 4 | የማከማቻ | 2 |
-| 5 | ለph | 2 |
-| 6 | ግብረመልሶችን | 2 |
-| 7 | behi | 2 |
-| 8 | ቤሂ | 2 |
-| 9 | goli | 2 |
-| 10 | ክሩድስ | 2 |
 ### Zipf's Law Analysis
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 0.9367 |
-| R² (Goodness of Fit) | 0.995214 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
@@ -371,7 +403,7 @@ Below are text samples generated from each subword-based Markov chain model:
 - **Zipf Compliance:** R²=0.9952 indicates excellent adherence to Zipf's law
 - **High Frequency Dominance:** Top 100 words cover 22.7% of corpus
-- **Long Tail:** 89,716 words needed for remaining 25.1% coverage
 ---
 ## 5. Word Embeddings Evaluation
@@ -387,37 +419,40 @@ Below are text samples generated from each subword-based Markov chain model:
 ### 5.1 Cross-Lingual Alignment
-> *Note: Multilingual alignment visualization not available for this language.*
 ### 5.2 Model Comparison
 | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
 |-------|-----------|----------|------------------|---------------|----------------|
-| **mono_32d** | 32 | 0.9125 | 0.3250 | N/A | N/A |
-| **mono_64d** | 64 | 0.9163 🏆 | 0.2292 | N/A | N/A |
-| **mono_128d** | 128 | 0.8535 | 0.1745 | N/A | N/A |
 ### Key Findings
-- **Best Isotropy:** mono_64d with 0.9163 (more uniform distribution)
-- **Semantic Density:** Average pairwise similarity of 0.2429. Lower values indicate better semantic separation.
-- **Alignment Quality:** No aligned models evaluated in this run.
 - **Recommendation:** 128d aligned for best cross-lingual performance
 ---
 ## 6.  Morphological Analysis (Experimental)
-> ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
 This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
 ### 6.1 Productivity & Complexity
 | Metric | Value | Interpretation | Recommendation |
 |--------|-------|----------------|----------------|
-| Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
-| Idiomaticity Gap | **-1.000** | Low formulaic content | - |
 ### 6.2 Affix Inventory (Productive Units)
@@ -432,18 +467,18 @@ Bound stems are high-frequency subword units that are semantically cohesive but
 | Stem | Cohesion | Substitutability | Examples |
 |------|----------|------------------|----------|
-| `እንደሚ` | 2.46x | 153 contexts | እንደሚሹ, እንደሚል, እንደሚሉ |
-| `ርስቲያ` | 2.48x | 60 contexts | ክርስቲያ, ክርስቲያኗ, ክርስቲያኖ |
-| `ትዮጵያ` | 2.23x | 57 contexts | እትዮጵያ, ኢትዮጵያ, ኢትዮጵያው |
-| `ግዚአብ` | 2.73x | 24 contexts | እግዚአብሔር, እግዚአብሐር, እግዚአብሄር |
-| `ኢትዮጵ` | 2.24x | 46 contexts | ኢትዮጵያ, ኢትዮጵያው, ኢትዮጵስት |
-| `መንግሥ` | 2.21x | 46 contexts | መንግሥተ, መንግሥት, መንግሥቱ |
-| `መንግስ` | 2.16x | 48 contexts | መንግስት, መንግስተ, መንግስቱ |
-| `ፈረንሳ` | 2.33x | 34 contexts | ፈረንሳዊ, ፈረንሳይ, በፈረንሳዩ |
-| `አስተዳ` | 2.33x | 33 contexts | አስተዳዳሪ, አስተዳደጓ, አስተዳደረ |
-| `እንግሊ` | 2.05x | 53 contexts | እንግሊዝ, እንግሊዙ, እንግሊኛ |
-| `tion` | 2.82x | 17 contexts | nation, action, section |
-| `ጀመሪያ` | 2.28x | 33 contexts | መጀመሪያ, ለመጀመሪያ, መጀመሪያው |
 ### 6.4 Affix Compatibility (Co-occurrence)
@@ -462,7 +497,9 @@ Using **Recursive Hierarchical Substitutability**, we decompose complex words in
 ### 6.6 Linguistic Interpretation
 > **Automated Insight:**
-The language AM appears to be more isolating or has a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating fewer productive morphological processes.
 ---
 ## 7. Summary & Recommendations
@@ -474,7 +511,7 @@ The language AM appears to be more isolating or has a highly fixed vocabulary. W
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
 | Tokenizer | **64k BPE** | Best compression (3.29x) |
-| N-gram | **2-gram** | Lowest perplexity (2,079) |
 | Markov | **Context-4** | Highest predictability (98.4%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
@@ -689,4 +726,4 @@ MIT License - Free for academic and commercial use.
 ---
 *Generated by Wikilangs Models Pipeline*
-*Report Date: 2026-01-03 05:13:17*

 ---
 language: am
+language_name: Amharic
 language_family: semitic_ethiopic
 tags:
   - wikilangs
   - n-gram
   - markov
   - wikipedia
+  - feature-extraction
+  - sentence-similarity
+  - tokenization
+  - n-grams
+  - markov-chain
+  - text-mining
+  - fasttext
+  - babelvec
+  - vocabulous
+  - vocabulary
   - monolingual
   - family-semitic_ethiopic
 license: mit
 library_name: wikilangs
+pipeline_tag: text-generation
 datasets:
   - omarkamali/wikipedia-monthly
 dataset_info:
 metrics:
   - name: best_compression_ratio
     type: compression
+    value: 3.293
   - name: best_isotropy
     type: isotropy
+    value: 0.9137
   - name: vocabulary_size
     type: vocab
     value: 0
 generated: 2026-01-03
 ---
+# Amharic - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study
+This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Amharic** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
 ## 📋 Repository Contents
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
+- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
 - [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
+| **8k** | 2.438x | 2.44 | 0.1566% | 682,453 |
+| **16k** | 2.748x | 2.75 | 0.1765% | 605,553 |
+| **32k** | 3.035x | 3.04 | 0.1950% | 548,316 |
+| **64k** | 3.293x 🏆 | 3.29 | 0.2116% | 505,279 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
+**Sample 1:** `እኔ እውነት እናገራለሁ ሌላውን አስኮንናለሁ የአማርኛ ምሳሌ ነው። ትርጉሙ መደብ: ያልተተረጎመ ምሳሌ`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁እኔ ▁እውነት ▁እና ገራ ለሁ ▁ሌላ ውን ▁አስ ኮ ንና ... (+9 more)` | 19 |
+| 16k | `▁እኔ ▁እውነት ▁እና ገራ ለሁ ▁ሌላውን ▁አስ ኮ ንና ለሁ ... (+8 more)` | 18 |
+| 32k | `▁እኔ ▁እውነት ▁እናገራ ለሁ ▁ሌላውን ▁አስ ኮ ንና ለሁ ▁የአማርኛ ... (+7 more)` | 17 |
+| 64k | `▁እኔ ▁እውነት ▁እናገራለሁ ▁ሌላውን ▁አስ ኮንና ለሁ ▁የአማርኛ ▁ምሳሌ ▁ነው። ... (+5 more)` | 15 |
+**Sample 2:** `እንኳን ለገንፎ ለሙቅም አልደነግጥ የአማርኛ ምሳሌ ነው። ትርጉሙ መደብ: ያልተተረጎመ ምሳሌ`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁እንኳን ▁ለ ገን ፎ ▁ለ ሙ ቅም ▁አል ደ ነግ ... (+9 more)` | 19 |
+| 16k | `▁እንኳን ▁ለ ገን ፎ ▁ለሙ ቅም ▁አል ደነግ ጥ ▁የአማርኛ ... (+7 more)` | 17 |
+| 32k | `▁እንኳን ▁ለ ገንፎ ▁ለሙ ቅም ▁አል ደነግጥ ▁የአማርኛ ▁ምሳሌ ▁ነው። ... (+5 more)` | 15 |
+| 64k | `▁እንኳን ▁ለገንፎ ▁ለሙ ቅም ▁አል ደነግጥ ▁የአማርኛ ▁ምሳሌ ▁ነው። ▁ትርጉሙ ... (+4 more)` | 14 |
+**Sample 3:** `ሞፈር ረገጠ በአማርኛ ፈሊጣዊ አነጋገር የሆነ ዘይቤ ነው። ትርጉም እራሱን ቻለ። ከቤተሰብ ቁጥጥር ውጭ ሆነ። ምሳሌ ደበበ ዕድሜ...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁ሞ ፈር ▁ረ ገ ጠ ▁በአማርኛ ▁ፈሊጣዊ ▁አነጋገር ▁የሆነ ▁ዘይቤ ... (+29 more)` | 39 |
+| 16k | `▁ሞ ፈር ▁ረገ ጠ ▁በአማርኛ ▁ፈሊጣዊ ▁አነጋገር ▁የሆነ ▁ዘይቤ ▁ነው። ... (+24 more)` | 34 |
+| 32k | `▁ሞፈር ▁ረገ ጠ ▁በአማርኛ ▁ፈሊጣዊ ▁አነጋገር ▁የሆነ ▁ዘይቤ ▁ነው። ▁ትርጉም ... (+21 more)` | 31 |
+| 64k | `▁ሞፈር ▁ረገ ጠ ▁በአማርኛ ▁ፈሊጣዊ ▁አነጋገር ▁የሆነ ▁ዘይቤ ▁ነው። ▁ትርጉም ... (+21 more)` | 31 |
 ### Key Findings
+- **Best Compression:** 64k achieves 3.293x compression
+- **Lowest UNK Rate:** 8k with 0.1566% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
 | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
 |--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 9,101 | 13.15 | 28,185 | 19.6% | 39.5% |
+| **2-gram** | Subword | 2,069 🏆 | 11.01 | 23,787 | 34.1% | 69.3% |
+| **3-gram** | Word | 9,934 | 13.28 | 35,745 | 22.2% | 40.6% |
+| **3-gram** | Subword | 19,035 | 14.22 | 153,217 | 11.9% | 35.6% |
+| **4-gram** | Word | 36,871 | 15.17 | 91,072 | 13.9% | 25.7% |
+| **4-gram** | Subword | 94,475 | 16.53 | 551,504 | 6.6% | 19.5% |
+| **5-gram** | Word | 32,696 | 15.00 | 78,497 | 14.6% | 26.2% |
+| **5-gram** | Subword | 213,435 | 17.70 | 879,311 | 5.0% | 14.3% |
 ### Top 5 N-grams by Size
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `ዓ ም` | 8,266 |
+| 2 | `ምሳሌ ነው` | 5,623 |
+| 3 | `የአማርኛ ምሳሌ` | 5,562 |
+| 4 | `እ ኤ` | 4,014 |
+| 5 | `ኤ አ` | 3,948 |
 **3-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `የአማርኛ ምሳሌ ነው` | 5,562 |
+| 2 | `እ ኤ አ` | 3,896 |
 | 3 | `ምሳሌ ነው ትርጉሙ` | 3,454 |
 | 4 | `መደብ ተረትና ምሳሌ` | 3,051 |
+| 5 | `ነው ትርጉሙ መደብ` | 2,530 |
 **4-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
 | 1 | `የአማርኛ ምሳሌ ነው ትርጉሙ` | 3,452 |
+| 2 | `ምሳሌ ነው ትርጉሙ መደብ` | 2,530 |
+| 3 | `ትርጉሙ መደብ ያልተተረጎመ ምሳሌ` | 2,115 |
+| 4 | `ነው ትርጉሙ መደብ ያልተተረጎመ` | 2,111 |
 | 5 | `ምሳሌ መደብ ተረትና ምሳሌ` | 1,854 |
+**5-grams (Word):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `የአማርኛ ምሳሌ ነው ትርጉሙ መደብ` | 2,529 |
+| 2 | `ምሳሌ ነው ትርጉሙ መደብ ያልተተረጎመ` | 2,111 |
+| 3 | `ነው ትርጉሙ መደብ ያልተተረጎመ ምሳሌ` | 2,111 |
+| 4 | `መደብ ያልተተረጎመ ምሳሌ መደብ ተረትና` | 1,812 |
+| 5 | `ያልተተረጎመ ምሳሌ መደብ ተረትና ምሳሌ` | 1,811 |
 **2-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `_ የ` | 172,656 |
+| 2 | `ት _` | 146,889 |
+| 3 | `_ በ` | 142,558 |
+| 4 | `ን _` | 134,273 |
+| 5 | `_ አ` | 115,168 |
 **3-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `_ እ ን` | 32,943 |
+| 2 | `_ ነ ው` | 26,886 |
+| 3 | `_ እ ና` | 24,633 |
+| 4 | `ው ። _` | 24,427 |
+| 5 | `እ ና _` | 23,097 |
 **4-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `_ እ ና _` | 22,966 |
+| 2 | `_ ነ ው ።` | 19,603 |
+| 3 | `ነ ው ። _` | 19,130 |
+| 4 | `_ እ ን ደ` | 14,167 |
+| 5 | `_ ላ ይ _` | 13,064 |
+**5-grams (Subword):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ ነ ው ። _` | 19,000 |
+| 2 | `_ ው ስ ጥ _` | 9,650 |
+| 3 | `ኢ ት ዮ ጵ ያ` | 7,988 |
+| 4 | `_ ም ሳ ሌ _` | 7,852 |
+| 5 | `_ እ ን ደ _` | 6,562 |
 ### Key Findings
+- **Best Perplexity:** 2-gram (subword) with 2,069
 - **Entropy Trend:** Decreases with larger n-grams (more predictable)
+- **Coverage:** Top-1000 patterns cover ~14% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
 | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
 |---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 0.7520 | 1.684 | 4.82 | 237,556 | 24.8% |
+| **1** | Subword | 1.2212 | 2.331 | 17.49 | 2,857 | 0.0% |
+| **2** | Word | 0.1473 | 1.108 | 1.28 | 1,142,374 | 85.3% |
+| **2** | Subword | 1.0395 | 2.055 | 6.98 | 49,956 | 0.0% |
+| **3** | Word | 0.0354 | 1.025 | 1.06 | 1,462,526 | 96.5% |
+| **3** | Subword | 0.6359 | 1.554 | 3.37 | 348,652 | 36.4% |
+| **4** | Word | 0.0157 🏆 | 1.011 | 1.02 | 1,537,232 | 98.4% |
+| **4** | Subword | 0.4526 | 1.368 | 2.15 | 1,173,222 | 54.7% |
 ### Generated Text Samples (Word-based)
 **Context Size 1:**
+1. `ነው ወደዚህም የተሳበው ተፈጥሮን ፀባዮች የሚመረኮዝ ፆታዊ ውሳኔ ተላልፏል ውድድሩን በ ኤርትራውያን የኩራት ምንጭ የበለጠ የተገደበ`
+2. `እና ሳይንሶችን እንዲሁም ከላስታና ከላሊበላ ከፍተኛ የመብት ጥሰቶች እ ኤ አ የተካሄደውን መፈንቅለ መንግሥት በጌሤም የአሦርም`
+3. `ላይ እንዲገኝ ስለሚያስገድድ ነው ኬንያ ወደሚገኘው ማይ ጎጋ የተባለ የህንድ ጥቃቶች የተጠበቀ እና በችግር ጊዜ የተረጋገጠ`
 **Context Size 2:**
+1. `ዓ ም የዊስቡር ልጅ ዶማር 300 307 ዓ ም ተከለከለ ታጂኪስታን ዓ ም በነሐሴ ወር 450 ዓ`
+2. `ምሳሌ ነው ጦጣ መጀመሪያ የመቀመጫዬን አለች አሉ ጦጣ ባለቤቱን ታስወጣ የአማርኛ ምሳሌ ነው ትርጉሙ መደብ ያልተተረጎመ ምሳሌ`
+3. `የአማርኛ ምሳሌ ነው ለላሙ መንጃ ለሸማው መቅደጃ የአማርኛ ምሳሌ ነው ትርጉሙ መደብ ተረትና ምሳሌ መደብ ተረትና ምሳሌ`
 **Context Size 3:**
+1. `የአማርኛ ምሳሌ ነው ትርጉሙ መደብ ያልተተረጎመ ምሳሌ መደብ ተረትና ምሳሌ wiz`
+2. `እ ኤ አ ቦራስ ስዊድን የግሪክ ዘፋኝ ነች አልበሞች protereotita my number one iparhi logos the game of`
+3. `ምሳሌ ነው ትርጉሙ መደብ ያልተተረጎመ ምሳሌ መደብ ተረትና ምሳሌ መደብ ተረትና ምሳሌ መደብ ተረትና ምሳሌ ምሳሌ`
 **Context Size 4:**
+1. `የአማርኛ ምሳሌ ነው ትርጉሙ መደብ ያልተተረጎመ ምሳሌ መደብ ተረትና ምሳሌ wiz`
+2. `ምሳሌ ነው ትርጉሙ መደብ ተረትና ምሳሌ በሬ ካራጁ ይዉላል`
+3. `ነው ትርጉሙ መደብ ያልተተረጎመ ምሳሌ መደብ ተረትና ምሳሌ መደብ ያልተተረጎመ ምሳሌ`
 ### Generated Text Samples (Subword-based)
 **Context Size 1:**
+1. `_493_የለቀድ_አት_በተገ`
+2. `ንዳዎች_20_የሳት_ወቀን_`
+3. `ት_ገኙ_ነበትላን_ጆች_በዚ`
 **Context Size 2:**
+1. `_የጠፈ_እጅጉ_ሙከራ_ተፈጥሮ`
+2. `ት_ከተማ_እቃ_ለማብራዶሮ_ሶ`
+3. `_በሁለተቸት_ስለ_ተመሳር_ከ`
 **Context Size 3:**
+1. `_እንዲሁም_ዘር።_ከነዚህ_ጊዜ`
+2. `_ነው_፡፡_አየሩ_በኋላም_ብዙ`
+3. `_እና_ጁላይ_ጥይቶቹ_ላይ_(2`
 **Context Size 4:**
+1. `_እና_የከተማ፡-_ጎንደርና_አገ`
+2. `_ነው።_ባብዛኛው_ህይወት_ውስጥ`
+3. `ነው።_ዓ.ም_ኪዮሺ_ሱጊዩራ_(1`
 ### Key Findings
 - **Best Predictability:** Context-4 (word) with 98.4% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
+- **Memory Trade-off:** Larger contexts require more storage (1,173,222 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
 | Metric | Value |
 |--------|-------|
+| Vocabulary Size | 100,186 |
+| Total Tokens | 1,652,256 |
+| Mean Frequency | 16.49 |
 | Median Frequency | 3 |
+| Frequency Std Dev | 176.36 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | ነው | 26,831 |
+| 2 | እና | 23,089 |
+| 3 | ላይ | 13,382 |
+| 4 | ምሳሌ | 11,608 |
+| 5 | ውስጥ | 9,891 |
+| 6 | ነበር | 9,130 |
+| 7 | ዓ | 8,627 |
+| 8 | ወደ | 8,565 |
+| 9 | ም | 8,525 |
+| 10 | እንደ | 6,906 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | ጂኒካ | 2 |
+| 2 | ዲኒካላ | 2 |
+| 3 | ወስደሽ | 2 |
+| 4 | አንኳኳ | 2 |
+| 5 | መዳልወ | 2 |
+| 6 | ረድእ | 2 |
+| 7 | አንደኛይቱ | 2 |
+| 8 | ወደሰልፍ | 2 |
+| 9 | የኒኮፖሊስ | 2 |
+| 10 | ጂምናዚየም | 2 |
 ### Zipf's Law Analysis
 | Metric | Value |
 |--------|-------|
+| Zipf Coefficient | 0.9364 |
+| R² (Goodness of Fit) | 0.995158 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 - **Zipf Compliance:** R²=0.9952 indicates excellent adherence to Zipf's law
 - **High Frequency Dominance:** Top 100 words cover 22.7% of corpus
+- **Long Tail:** 90,186 words needed for remaining 25.1% coverage
 ---
 ## 5. Word Embeddings Evaluation
 ### 5.1 Cross-Lingual Alignment
+![Alignment Quality](visualizations/embedding_alignment_quality.png)
+![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
 ### 5.2 Model Comparison
 | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
 |-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.9080 | 0.3255 | N/A | N/A |
+| **mono_64d** | 64 | 0.9137 | 0.2344 | N/A | N/A |
+| **mono_128d** | 128 | 0.8453 | 0.1726 | N/A | N/A |
+| **aligned_32d** | 32 | 0.9080 | 0.3232 | 0.0220 | 0.1700 |
+| **aligned_64d** | 64 | 0.9137 🏆 | 0.2323 | 0.0420 | 0.1840 |
+| **aligned_128d** | 128 | 0.8453 | 0.1725 | 0.0680 | 0.2480 |
 ### Key Findings
+- **Best Isotropy:** aligned_64d with 0.9137 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.2434. Lower values indicate better semantic separation.
+- **Alignment Quality:** Aligned models achieve up to 6.8% R@1 in cross-lingual retrieval.
 - **Recommendation:** 128d aligned for best cross-lingual performance
 ---
 ## 6.  Morphological Analysis (Experimental)
 This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
 ### 6.1 Productivity & Complexity
 | Metric | Value | Interpretation | Recommendation |
 |--------|-------|----------------|----------------|
+| Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
+| Idiomaticity Gap | **0.840** | High formulaic/idiomatic content | - |
 ### 6.2 Affix Inventory (Productive Units)
 | Stem | Cohesion | Substitutability | Examples |
 |------|----------|------------------|----------|
+| `እንደሚ` | 2.39x | 158 contexts | እንደሚሉ, እንደሚሻ, እንደሚል |
+| `ርስቲያ` | 2.46x | 61 contexts | ክርስቲያ, ክርስቲያኗ, ክርስቲያኑ |
+| `ትዮጵያ` | 2.23x | 57 contexts | ኢትዮጵያ, እትዮጵያ, ኢትዮጵያና |
+| `መንግስ` | 2.21x | 49 contexts | መንግስቱ, መንግስት, መንግስተ |
+| `ግዚአብ` | 2.66x | 23 contexts | እግዚአብሔር, እግዚአብሐር, እግዚአብሄር |
+| `ኢትዮጵ` | 2.18x | 46 contexts | ኢትዮጵያ, ኢትዮጵያና, የኢትዮጵያ |
+| `እንግሊ` | 2.08x | 52 contexts | እንግሊኛ, እንግሊዙ, እንግሊዝ |
+| `መንግሥ` | 2.12x | 46 contexts | መንግሥት, መንግሥተ, መንግሥቱ |
+| `ጀመሪያ` | 2.29x | 33 contexts | መጀመሪያ, በመጀመሪያ, ለመጀመሪያ |
+| `ፈረንሳ` | 2.27x | 34 contexts | ፈረንሳይ, ፈረንሳዊ, የፈረንሳዩ |
+| `tion` | 2.77x | 17 contexts | nation, action, section |
+| `መጀመሪ` | 2.29x | 31 contexts | መጀመሪአ, መጀመሪያ, የመጀመሪ |
 ### 6.4 Affix Compatibility (Co-occurrence)
 ### 6.6 Linguistic Interpretation
 > **Automated Insight:**
+The language Amharic shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
+> **Note on Idiomaticity:** The high Idiomaticity Gap suggests a large number of frequent multi-word expressions or formulaic sequences that are statistically distinct from their component parts.
 ---
 ## 7. Summary & Recommendations
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
 | Tokenizer | **64k BPE** | Best compression (3.29x) |
+| N-gram | **2-gram** | Lowest perplexity (2,069) |
 | Markov | **Context-4** | Highest predictability (98.4%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 ---
 *Generated by Wikilangs Models Pipeline*
+*Report Date: 2026-01-03 14:11:24*

models/embeddings/aligned/am_128d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3964ee0c4f9ca092d9f74907f56f9a3d93b19752347882a64e552d576a095e2b
+size 1064306440

models/embeddings/aligned/am_128d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "am", "dim": 128, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/am_128d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cfbe39d8562cf38339d4ba377db708ff89f8f76337965c05da5c47a5511cd90d
+size 65664

models/embeddings/aligned/am_128d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "am",
+  "dimension": 128,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 2411,
+  "vocab_size": 38514
+}

models/embeddings/aligned/am_32d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43acd2454c73ec6a90e9b75922f4a4ecc9db024320951b2c88c136aaaf3e57dc
+size 266727688

models/embeddings/aligned/am_32d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "am", "dim": 32, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/am_32d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b3d0d5ccf51dddf65743a5123bfc0ecd1944f4cfce415736d0e147acf3d55f4b
+size 4224

models/embeddings/aligned/am_32d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "am",
+  "dimension": 32,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 2411,
+  "vocab_size": 38514
+}

models/embeddings/aligned/am_64d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad86c60bc61db296fa76aedb6ab90d476fc19f98d61421e743d16270d0805cf5
+size 532587272

models/embeddings/aligned/am_64d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "am", "dim": 64, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/am_64d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:56b5628675cc1634a4f9405dd4c0ad1d8ef1da74827604d4d4f4a7c37742850a
+size 16512

models/embeddings/aligned/am_64d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "am",
+  "dimension": 64,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 2411,
+  "vocab_size": 38514
+}

models/embeddings/monolingual/am_128d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1d4be827f7a31270ecd980640a7032e11c5ed3885d037c1b8cd46df3d9007492
-size 1063990202

 version https://git-lfs.github.com/spec/v1
+oid sha256:3964ee0c4f9ca092d9f74907f56f9a3d93b19752347882a64e552d576a095e2b
+size 1064306440

models/embeddings/monolingual/am_128d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 128
   },
-  "vocab_size": 38213
 }

     "encoding_method": "rope",
     "dim": 128
   },
+  "vocab_size": 38514
 }

models/embeddings/monolingual/am_32d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1facd0cecd15328df01b8eb00c409adeaac9321a5d62748462e3055c6a4b976e
-size 266642618

 version https://git-lfs.github.com/spec/v1
+oid sha256:43acd2454c73ec6a90e9b75922f4a4ecc9db024320951b2c88c136aaaf3e57dc
+size 266727688

models/embeddings/monolingual/am_32d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 32
   },
-  "vocab_size": 38213
 }

     "encoding_method": "rope",
     "dim": 32
   },
+  "vocab_size": 38514
 }

models/embeddings/monolingual/am_64d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f42620197f1e1b51b39471cc1e5886ab38c161775048e9529ceb06fdf065e6e1
-size 532425146

 version https://git-lfs.github.com/spec/v1
+oid sha256:ad86c60bc61db296fa76aedb6ab90d476fc19f98d61421e743d16270d0805cf5
+size 532587272

models/embeddings/monolingual/am_64d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 64
   },
-  "vocab_size": 38213
 }

     "encoding_method": "rope",
     "dim": 64
   },
+  "vocab_size": 38514
 }

models/subword_markov/am_markov_ctx1_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0dea8b9be146dc2d3f0da26601ce42020e0c84af4656d9f7692b3f4d34d5d6ae
-size 356106

 version https://git-lfs.github.com/spec/v1
+oid sha256:8839cf6220dd43e6dea9746686fa8a81a9a14dd02c6132ff81405220cf661652
+size 350051

models/subword_markov/am_markov_ctx1_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "subword",
   "language": "am",
-  "unique_contexts": 2854,
-  "total_transitions": 8936316
 }

   "context_size": 1,
   "variant": "subword",
   "language": "am",
+  "unique_contexts": 2857,
+  "total_transitions": 9022824
 }

models/subword_markov/am_markov_ctx2_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0e9b01d5674bb78549bd0f7a41241639ddc7934db689c638aee8d0daacc70172
-size 2154891

 version https://git-lfs.github.com/spec/v1
+oid sha256:27974fda5f8693d4915e9b657526e84bd0fb36fde9d966f19e785d000aedbe5c
+size 2143373

models/subword_markov/am_markov_ctx2_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "am",
-  "unique_contexts": 49981,
-  "total_transitions": 8923902
 }

   "context_size": 2,
   "variant": "subword",
   "language": "am",
+  "unique_contexts": 49956,
+  "total_transitions": 9010410
 }

models/subword_markov/am_markov_ctx3_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:36e05d4c86f22f4d3f6c7dff80e4b868513b425317942829e61a871110f71ee0
-size 8408029

 version https://git-lfs.github.com/spec/v1
+oid sha256:82c019b23a13a08dd018f89e36328fb55a539c91738572761bc8164c5041b3f4
+size 8416653

models/subword_markov/am_markov_ctx3_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "am",
-  "unique_contexts": 348535,
-  "total_transitions": 8911488
 }

   "context_size": 3,
   "variant": "subword",
   "language": "am",
+  "unique_contexts": 348652,
+  "total_transitions": 8997996
 }

models/subword_markov/am_markov_ctx4_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ab3d345339fb5097d316a6acad43d3cbd7f781c24bd690b17950a9d30bdd7dfa
-size 23278804

 version https://git-lfs.github.com/spec/v1
+oid sha256:f07c64385b4c55e182b6ffee304e1810b2d71ef22b3c5a0b251758b633fca59c
+size 23309934

models/subword_markov/am_markov_ctx4_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "am",
-  "unique_contexts": 1171344,
-  "total_transitions": 8899074
 }

   "context_size": 4,
   "variant": "subword",
   "language": "am",
+  "unique_contexts": 1173222,
+  "total_transitions": 8985582
 }

models/subword_ngram/am_2gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6bcf32e20adcf3b9aa77e784d9fd125484c7c7d01f306652cf74a76522fbc536
-size 300324

 version https://git-lfs.github.com/spec/v1
+oid sha256:5eabd2483a9b43b8f8fd572f71957cc349ed926f2f419138f2f3a188ed78c42f
+size 300472

models/subword_ngram/am_2gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "am",
-  "unique_ngrams": 23804,
-  "total_ngrams": 8936316
 }

   "n": 2,
   "variant": "subword",
   "language": "am",
+  "unique_ngrams": 23787,
+  "total_ngrams": 9022824
 }

models/subword_ngram/am_3gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9c8cece5e05ace848d6a685201f3e49e33ed838dc7744aec46795f09bbfdb9e0
-size 1881540

 version https://git-lfs.github.com/spec/v1
+oid sha256:3c9b592f96c76cc3101fee1a715cf6c4149dfe51039e391d6098096f9714abe5
+size 1885405

models/subword_ngram/am_3gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "am",
-  "unique_ngrams": 153027,
-  "total_ngrams": 8923902
 }

   "n": 3,
   "variant": "subword",
   "language": "am",
+  "unique_ngrams": 153217,
+  "total_ngrams": 9010410
 }

models/subword_ngram/am_4gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f6cf11a9b9599d5b462fc5775302067d643de8e8fd248de5060f89351b18b9ab
-size 7082960

 version https://git-lfs.github.com/spec/v1
+oid sha256:899527071ee5237fc41c2fdb6de104208b69f563db321792d2b61794349f7f99
+size 7120084

models/subword_ngram/am_4gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "am",
-  "unique_ngrams": 549996,
-  "total_ngrams": 8911488
 }

   "n": 4,
   "variant": "subword",
   "language": "am",
+  "unique_ngrams": 551504,
+  "total_ngrams": 8997996
 }

models/subword_ngram/am_5gram_subword.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d6256142f289450d718eafc3693efd234fd5e20489751f38071b275a053df337
+size 12113547

models/subword_ngram/am_5gram_subword_metadata.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "n": 5,
+  "variant": "subword",
+  "language": "am",
+  "unique_ngrams": 879311,
+  "total_ngrams": 8985582
+}

models/tokenizer/am_tokenizer_16k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f02cb4592939e59c4831f57ca855266e8d28172e5efebde508c8b57daefbafb6
-size 559482

 version https://git-lfs.github.com/spec/v1
+oid sha256:802c7e23f92ccb7959b0feb6dc8f82635d55d846dbd9f4570913915ce17d5785
+size 559625

models/tokenizer/am_tokenizer_16k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/am_tokenizer_32k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:93bd53271f8c4e6a918072a97dd0581c020a8af645616093bc8213a30b88940e
-size 902409

 version https://git-lfs.github.com/spec/v1
+oid sha256:15d15bc01b2e176dbce09f1705536a89afa2737570c8c47d3353634f9f68a94a
+size 902568

models/tokenizer/am_tokenizer_32k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/am_tokenizer_64k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b8481661d7e8d938bace980d6a5c315614597c92fb98ccbef70df78797efdecc
-size 1589488

 version https://git-lfs.github.com/spec/v1
+oid sha256:5022a070bcee7d18664184b36241ade2425fae873f2d51e1b98af97932ce4f68
+size 1589838

models/tokenizer/am_tokenizer_64k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/am_tokenizer_8k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4ad931bdd41158b2542d8aac191257d87f612a4ef884c47473efc0d7a86dd011
-size 394754

 version https://git-lfs.github.com/spec/v1
+oid sha256:5a8188f6c50ce22b57642fe6d5b7e098ba217e95117378d4c365656422f23b18
+size 394741

models/tokenizer/am_tokenizer_8k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/vocabulary/am_vocabulary.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:294dfe907fb7e7f0defd53a349bd532546806ddc295a92c292188997c0d498bd
-size 1787217

 version https://git-lfs.github.com/spec/v1
+oid sha256:fd540986de5c037c2d8004d9247b062277611308cc1b9bc858ffc3394fee94dd
+size 1782777

models/vocabulary/am_vocabulary_metadata.json CHANGED Viewed

@@ -1,17 +1,17 @@
 {
   "language": "am",
-  "vocabulary_size": 99716,
   "variant": "full",
   "statistics": {
-    "type_token_ratio": 0.13335844069738334,
     "coverage": {
-      "top_100": 0.20952339607919193,
-      "top_1000": 0.42309028051841446,
-      "top_5000": 0.6109410976729082,
-      "top_10000": 0.6909696930060957
     },
-    "hapax_count": 136824,
-    "hapax_ratio": 0.5784391646233196,
     "total_documents": 12414
   }
 }

 {
   "language": "am",
+  "vocabulary_size": 100186,
   "variant": "full",
   "statistics": {
+    "type_token_ratio": 0.13283071071151606,
     "coverage": {
+      "top_100": 0.209810304098978,
+      "top_1000": 0.42304666635378463,
+      "top_5000": 0.6110407126558544,
+      "top_10000": 0.6911446565337589
     },
+    "hapax_count": 137556,
+    "hapax_ratio": 0.5785936014671367,
     "total_documents": 12414
   }
 }

models/word_markov/am_markov_ctx1_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:61264cf69517fe843352a0b6cc8383107c7cd20cb9613edb6a901779636cd1bd
-size 13693182

 version https://git-lfs.github.com/spec/v1
+oid sha256:95ccb7912954a141c9b8b7ca95e221a9fccec6dd51f1c444544019f2ac02b83d
+size 13813306

models/word_markov/am_markov_ctx1_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "am",
-  "unique_contexts": 236353,
-  "total_transitions": 1761302
 }

   "context_size": 1,
   "variant": "word",
   "language": "am",
+  "unique_contexts": 237556,
+  "total_transitions": 1777398
 }

models/word_markov/am_markov_ctx2_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c565bae97b8cf56a6c0d532f8f790eb6482096f8f6f9ee9069aec8edcf717f1d
-size 29639293

 version https://git-lfs.github.com/spec/v1
+oid sha256:d1434179ed6804e7277902166d67afa9d340c443633dfac8c4f6574077bc3705
+size 30016914

models/word_markov/am_markov_ctx2_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "word",
   "language": "am",
-  "unique_contexts": 1130961,
-  "total_transitions": 1748889
 }

   "context_size": 2,
   "variant": "word",
   "language": "am",
+  "unique_contexts": 1142374,
+  "total_transitions": 1764985
 }