# Spaces status: Sleeping
| from transformers import pipeline | |
| import requests | |
| import json | |
| import gradio as gr | |
# JavaScript handed to gr.Blocks(js=...). It defines showCard/hideCard, which
# show, fill and hide the element with id 'hovercard', and publishes both on
# globalThis so the inline onmouseover/onmouseout/onclick handlers emitted by
# predict() can reach them.
js = """
async () => {
function showCard(event, title, content) {
document.getElementById('hovercard').style.visibility = 'visible';
document.getElementById('card_title').innerText = title;
document.getElementById('card_content').innerText = content;
}
function hideCard(event) {
document.getElementById('hovercard').style.visibility = 'hidden';
}
globalThis.showCard = showCard;
globalThis.hideCard = hideCard;
}
"""
def get_matches(text):
    """Label ``text`` with the model and align each labelled token to the input.

    The model output is split on ``" = "`` and then on ``"+"``; every resulting
    piece of the form ``surface:lexicon`` is searched for in ``text``, strictly
    left to right, starting where the previous token ended.

    Args:
        text: The string to analyse.

    Returns:
        A tuple ``(text, pred, matches)`` where ``pred`` is the raw model
        output and ``matches`` is a list of dicts with keys ``"start"``,
        ``"end"`` (offsets into ``text``), ``"match"`` (the surface string)
        and ``"lexicon"`` (its label). ``matches`` is empty when nothing in
        the model output can be aligned.
    """
    pred = pipe(text, max_length=5000)[0]["translation_text"]

    # Flatten "a:X+b:Y = c:Z" into [["a", "X"], ["b", "Y"], ["c", "Z"]].
    pairs = [
        piece.split(":")
        for group in pred.split(" = ")
        for piece in group.split("+")
    ]

    matches = []
    pos = 0
    for pair in pairs:
        # Only well-formed "surface:lexicon" pieces can be aligned.
        if len(pair) != 2:
            continue
        surface, lexicon = pair
        # An empty surface would give a zero-length match, which a consumer
        # stepping through the text via each match's "end" cannot move past.
        if not surface:
            continue
        start = text.find(surface, pos)
        if start == -1:
            # Tokens are aligned strictly in order, so a token that cannot be
            # found after the previous one ends the alignment.
            break
        end = start + len(surface)
        matches.append(
            {"start": start, "end": end, "match": surface, "lexicon": lexicon}
        )
        pos = end

    return (text, pred, matches)
# Model used by get_matches() to produce the labelled tokenization.
pipe = pipeline("translation", "guymorlan/TokenizerLabeller")

# Lexicon consulted by predict() for each label's "translation" and "features".
# The timeout keeps startup from hanging on a stalled connection, and
# raise_for_status() reports an HTTP failure directly instead of letting an
# error page reach the JSON parser.
r = requests.get(
    "https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/playaling_words.json",
    timeout=30,
)
r.raise_for_status()
data = json.loads(r.text)
def predict(input):
    """Render the analysed text as HTML with hoverable, highlighted tokens.

    Args:
        input: Text to analyse; passed unchanged to ``get_matches``.

    Returns:
        An HTML string: a right-to-left ``<div>`` holding the text, in which
        every matched token whose lexicon label (minus a trailing ``_R``) is a
        key of ``data`` is wrapped in a ``<span>`` wired to ``showCard`` /
        ``hideCard``, followed by the hidden ``hovercard`` element that those
        handlers fill in.
    """
    # Imported locally because the module-level name ``html`` is rebound to a
    # Gradio component when the UI is built.
    from html import escape

    def js_string(value):
        """Return ``value`` as a JS string literal safe inside an HTML attribute."""
        # json.dumps yields a correctly quoted/escaped JS literal; escape()
        # then protects the surrounding single-quoted HTML attribute.
        return escape(json.dumps(str(value), ensure_ascii=False))

    text, _pred, matches = get_matches(input)
    # Index matches by start offset. Zero-length matches are dropped because
    # jumping to their "end" would never advance the scan below.
    by_start = {m["start"]: m for m in matches if m["end"] > m["start"]}

    parts = [
        "\n<div style='direction: rtl; text-align: right; font-size: 18px; "
        "font-family: Arial, sans-serif; line-height: 1.5'>"
    ]

    i = 0
    while i < len(text):
        hit = by_start.get(i)
        if hit is None:
            # Plain character outside any match; escaped so user input cannot
            # inject markup.
            parts.append(escape(text[i]))
            i += 1
            continue

        label = hit["lexicon"]
        # Labels carrying the "_R" suffix are looked up without it.
        if label.endswith("_R"):
            label = label[:-2]

        token = escape(hit["match"])
        if label in data:
            entry = data[label]
            call = (
                f"showCard(event, {js_string(entry['translation'])}, "
                f"{js_string(entry['features'])})"
            )
            parts.append(
                "\n<span style='background-color: #4CAF50; color: #FFFFFF; "
                "border: 1px solid #4CAF50; border-radius: 5px; "
                "font-family: \"Courier New\", Courier, monospace;'\n"
                f"onmouseover='{call}'\n"
                f"onmouseout='hideCard(event)' onclick='{call}'>{token}</span>\n"
            )
        else:
            # Matched by the model but absent from the lexicon: show as-is.
            parts.append(token)
        i = hit["end"]

    parts.append("</div>")
    parts.append(
        "\n<div id='hovercard' style='position: absolute; visibility: hidden; "
        "background: #FFFFFF; padding: 10px;\n"
        "border: 1px solid #9E9E9E; border-radius: 5px; "
        "font-family: Arial, sans-serif;'>\n"
        "<h3 id='card_title' style='color: #000000;'></h3>\n"
        "<p id='card_content' style='color: #000000;'></p>\n"
        "</div>\n"
    )
    return "".join(parts)
# Gradio UI: one row with two columns — the input textbox, examples and the
# Analyze button in the first, the rendered HTML output in the second.
# NOTE(review): the nesting below is reconstructed from a source whose
# indentation was lost — confirm against the deployed app.
with gr.Blocks(theme=gr.themes.Soft(), title="Ammiya Tokenizer and Labeler", js = js) as demo:
    gr.HTML("<h2><span style='color: #2563eb'>Colloquial Arabic</span></h2> Tokenizer and Annotator")
    with gr.Row():
        with gr.Column():
            # NOTE: this module-level name shadows the builtin ``input``.
            input = gr.Textbox(label="Input", placeholder="Enter Arabic Text", lines=1)
            gr.Examples(examples=["بديش اروح معك", "معملتش اشي"], inputs=input)
            btn = gr.Button("Analyze")
        with gr.Column():
            # NOTE: this rebinds the module-level name ``html`` to a component.
            html = gr.HTML()
    # Clicking the button and submitting the textbox both run predict() and
    # write its result into the HTML component.
    btn.click(predict, inputs=[input], outputs=[html])
    input.submit(predict, inputs=[input], outputs=[html])
    demo.load()

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()