Update README.md
Browse files
README.md
CHANGED
|
@@ -5,8 +5,12 @@ datasets:
|
|
| 5 |
---
|
| 6 |
|
| 7 |
# Tebyan تبيـان
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
## Model description
|
| 12 |
|
|
@@ -25,7 +29,7 @@ Person, Location, Organization, Nationality, Job, Product, Event, Time, Art-Work
|
|
| 25 |
|
| 26 |
Install the transformers and nltk packages (Python >= 3.6)
|
| 27 |
|
| 28 |
-
`$ pip3 install transformers==4.
|
| 29 |
|
| 30 |
> If you are using `Google Colab`, please restart your runtime after installing the packages.
|
| 31 |
|
|
@@ -33,6 +37,7 @@ Install transformers AND nltk (python >= 3.6)
|
|
| 33 |
|
| 34 |
```python
|
| 35 |
# we need to download the NLTK punkt tokenizer data, used for word tokenization
|
|
|
|
| 36 |
from collections import defaultdict
|
| 37 |
import nltk
|
| 38 |
nltk.download('punkt')
|
|
@@ -41,6 +46,9 @@ from nltk.tokenize import word_tokenize
|
|
| 41 |
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
| 42 |
from transformers import pipeline
|
| 43 |
|
|
|
|
|
|
|
|
|
|
| 44 |
# ===== load the model and its tokenizer
|
| 45 |
m_name = "marefa-nlp/marefa-ner"
|
| 46 |
tokenizer = AutoTokenizer.from_pretrained(m_name)
|
|
@@ -50,49 +58,39 @@ model = AutoModelForTokenClassification.from_pretrained(m_name)
|
|
| 50 |
nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
|
| 51 |
|
| 52 |
# ===== extract the entities from a sample text
|
| 53 |
-
example = '
|
| 54 |
# clean the text
|
| 55 |
example = " ".join(word_tokenize(example))
|
| 56 |
# feed to the NER model to parse
|
| 57 |
ner_results = nlp(example)
|
| 58 |
|
| 59 |
-
#
|
| 60 |
-
# we prepared a simple fix-up snippet to reconstruct full entity tokens
|
| 61 |
|
| 62 |
-
|
| 63 |
-
fixed_ner_results = []
|
| 64 |
for ent in ner_results:
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
fixed_ner_results.append(current_ent)
|
| 84 |
-
|
| 85 |
-
# sort entities
|
| 86 |
-
fixed_ner_results = sorted(fixed_ner_results, key=lambda e: e['start'], reverse=False)
|
| 87 |
-
|
| 88 |
-
# ===== print the ner_results
|
| 89 |
-
for ent in fixed_ner_results:
|
| 90 |
-
print(ent["word"], '->' ,ent['entity_group'], " # score:", "%.2f" % ent['score'])
|
| 91 |
|
| 92 |
#####
|
| 93 |
-
#
|
| 94 |
-
#
|
| 95 |
-
# الجيش
|
|
|
|
| 96 |
####
|
| 97 |
|
| 98 |
```
|
|
@@ -103,6 +101,7 @@ for ent in fixed_ner_results:
|
|
| 103 |
|
| 104 |
- على سيد عبد الحفيظ - إشراف
|
| 105 |
- نرمين محمد عطيه
|
|
|
|
| 106 |
- احمد علي عبدربه
|
| 107 |
- عمر بن عبد العزيز سليمان
|
| 108 |
- محمد ابراهيم الجمال
|
|
|
|
| 5 |
---
|
| 6 |
|
| 7 |
# Tebyan تبيـان
|
| 8 |
+
## Marefa Arabic Named Entity Recognition Model
|
| 9 |
+
## نموذج المعرفة لتصنيف أجزاء النص
|
| 10 |
+
---------
|
| 11 |
+
**Version**: 1.0.1
|
| 12 |
+
|
| 13 |
+
**Last Update:** 16-05-2021
|
| 14 |
|
| 15 |
## Model description
|
| 16 |
|
|
|
|
| 29 |
|
| 30 |
Install the transformers and nltk packages (Python >= 3.6)
|
| 31 |
|
| 32 |
+
`$ pip3 install transformers==4.6.0 nltk==3.5 protobuf==3.15.3 torch==1.7.1`
|
| 33 |
|
| 34 |
> If you are using `Google Colab`, please restart your runtime after installing the packages.
|
| 35 |
|
|
|
|
| 37 |
|
| 38 |
```python
|
| 39 |
# we need to download the NLTK punkt tokenizer data, used for word tokenization
|
| 40 |
+
# we need to download the NLTK punkt tokenizer data, used for word tokenization
|
| 41 |
from collections import defaultdict
|
| 42 |
import nltk
|
| 43 |
nltk.download('punkt')
|
|
|
|
| 46 |
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
| 47 |
from transformers import pipeline
|
| 48 |
|
| 49 |
+
# labels list
|
| 50 |
+
labels_list = ['O', 'B-nationality', 'B-event', 'B-person', 'B-artwork', 'B-location', 'B-product', 'B-organization', 'B-job', 'B-time', 'I-nationality', 'I-event', 'I-person', 'I-artwork', 'I-location', 'I-product', 'I-organization', 'I-job', 'I-time']
|
| 51 |
+
|
| 52 |
# ===== load the model and its tokenizer
|
| 53 |
m_name = "marefa-nlp/marefa-ner"
|
| 54 |
tokenizer = AutoTokenizer.from_pretrained(m_name)
|
|
|
|
| 58 |
nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
|
| 59 |
|
| 60 |
# ===== extract the entities from a sample text
|
| 61 |
+
example = 'خاضت القوات المصرية حرب السادس من أكتوبر ضد الجيش الصهيوني عام 1973'
|
| 62 |
# clean the text
|
| 63 |
example = " ".join(word_tokenize(example))
|
| 64 |
# feed to the NER model to parse
|
| 65 |
ner_results = nlp(example)
|
| 66 |
|
| 67 |
+
# we prepared a simple snippet to reconstruct full entity tokens
|
|
|
|
| 68 |
|
| 69 |
+
modified_results = []
|
|
|
|
| 70 |
for ent in ner_results:
|
| 71 |
+
ent["entity_group"] = int(ent["entity_group"].lower().replace("label_",""))
|
| 72 |
+
ent["entity_group"] = labels_list[ent["entity_group"]]
|
| 73 |
+
|
| 74 |
+
if ent["entity_group"] != "O":
|
| 75 |
+
if "B-" in ent["entity_group"]:
|
| 76 |
+
ent["entity_group"] = ent["entity_group"].replace("B-","")
|
| 77 |
+
modified_results.append(ent)
|
| 78 |
+
elif "I-" in ent["entity_group"]:
|
| 79 |
+
## check related entity-group
|
| 80 |
+
label = ent["entity_group"].replace("I-","")
|
| 81 |
+
if len(modified_results) > 0 and label == modified_results[-1]["entity_group"]:
|
| 82 |
+
modified_results[-1]["word"] += f" {ent['word']}"
|
| 83 |
+
modified_results[-1]["score"] = sum([modified_results[-1]["score"], ent["score"]])/2
|
| 84 |
+
modified_results[-1]["end"] = ent["end"]
|
| 85 |
+
|
| 86 |
+
for res in modified_results:
|
| 87 |
+
print(res["word"], "==>", res["entity_group"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
#####
|
| 90 |
+
# القوات المصرية ==> organization
|
| 91 |
+
# حرب السادس من أكتوبر ==> event
|
| 92 |
+
# الجيش الصهيوني ==> organization
|
| 93 |
+
# عام 1973 ==> time
|
| 94 |
####
|
| 95 |
|
| 96 |
```
|
|
|
|
| 101 |
|
| 102 |
- على سيد عبد الحفيظ - إشراف
|
| 103 |
- نرمين محمد عطيه
|
| 104 |
+
- صلاح خيرالله
|
| 105 |
- احمد علي عبدربه
|
| 106 |
- عمر بن عبد العزيز سليمان
|
| 107 |
- محمد ابراهيم الجمال
|