TEXT SUMMARIZATION
Un altro elemento distintivo del Natural Language Processing è la text summarization, che ci permette di ottenere il sommario di un dato testo. L'algoritmo è della famiglia BERT (Bidirectional Encoder Representations from Transformers); qui si presenta un esempio creato con un modello pre-addestrato (pretrained) di nome BART.
L'applicazione verrà creata con un'interfaccia web tramite Plotly e Dash, che ci permettono di creare grafici e/o visualizzazioni per le applicazioni di Machine Learning.
%%sh
# Install the model library (transformers) and the Dash packages imported by
# the application code.
# NOTE(review): no versions are pinned, so the newest releases are installed;
# confirm the application code is compatible with them.
pip install transformers
pip install dash
pip install dash_bootstrap_components
# -q keeps pip's output quiet for the remaining packages.
pip install -q dash_core_components
pip install -q dash_html_components
pip install -q dash_table
%%sh
# Download the ngrok client archive (Linux amd64 build) into the current
# directory and unpack it, producing the ./ngrok executable used afterwards.
curl -O https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
unzip ngrok-stable-linux-amd64.zip
# Start ngrok in the background (trailing '&'), tunnelling HTTP traffic to
# local port 8050 -- presumably the port the Dash app is served on; confirm.
get_ipython().system_raw('./ngrok http 8050 &')
# Launch ngrok - needed to reach the web application
%%sh
# Get the public URL from ngrok -- click the printed link to open the web application.
# Queries ngrok's local API on port 4040 and prints the 'public_url' of the
# first tunnel in the JSON response.
curl -s http://localhost:4040/api/tunnels | python3 -c "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"
import time
import dash
import dash_html_components as html
import dash_core_components as dcc
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output, State

from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
import torch
# Select the compute device first: it must exist before the model is moved to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained BART summarization checkpoint and its tokenizer.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Compare on `device.type`: a torch.device object does not compare equal to
# the plain string "cuda", so the half-precision branch would never run.
if device.type == "cuda":
    # Half precision is only applied on the GPU.
    model = model.half()

model.to(device)
# Inference only: switch to evaluation mode.
model.eval()
# Define app
# Create the Dash application, loading the Bootstrap theme stylesheet shipped
# with dash_bootstrap_components.
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
# Keep a module-level reference to the application's underlying server object.
server = app.server

# Card with the generation settings (summary length, beam size), the run
# button and the elapsed-time read-out.
#
# Each group is a plain html.Div with a bottom margin instead of dbc.FormGroup:
# FormGroup no longer exists in dash-bootstrap-components >= 1.0, while a Div
# with the "mb-3" spacing class works on both old and new releases.
controls = dbc.Card(
    [
        html.Div(
            [
                dbc.Label("Max Length Risultato"),
                dcc.Slider(
                    id="max_length",
                    min=10,
                    max=100,
                    value=30,
                    # One tick label every 10 units: 10, 20, ..., 100.
                    marks={i: str(i) for i in range(10, 101, 10)},
                ),
            ],
            className="mb-3",
        ),
        html.Div(
            [
                dbc.Label("Beam Size"),
                dcc.Slider(
                    id="num_beams",
                    min=1,
                    max=6,
                    value=4,
                    # One tick label per allowed beam size: 1..6.
                    marks={i: str(i) for i in range(1, 7)},
                ),
            ],
            className="mb-3",
        ),
        html.Div(
            [
                # The spinner wraps the button and the timing output, both of
                # which belong to the summarization callback.
                dbc.Spinner(
                    [
                        dbc.Button("Summarize", id="button-run"),
                        html.Br(),
                        html.Br(),
                        dbc.Label("Tempo"),
                        html.Div(
                            id="time_taken",
                            style={"margin": "20px"},
                        ),
                    ]
                )
            ],
            className="mb-3",
        ),
    ],
    body=True,
    style={"height": "280px"},
)
# Define layout: a title bar, then two columns -- settings plus the summary
# output on the left (5/12), the input text area on the right (7/12).
#
# Groups are plain html.Div elements with the "mb-3" spacing class rather than
# dbc.FormGroup, which no longer exists in dash-bootstrap-components >= 1.0.
app.layout = dbc.Container(
    fluid=True,
    children=[
        html.H1(
            children='Summarization With BART MecBar.com',
            # Style keys are camelCase, consistent with 'textAlign'.
            style={
                'textAlign': 'center',
                'color': 'white',
                'backgroundColor': 'black',
            },
        ),
        html.Hr(),
        dbc.Row(
            [
                # Left column: controls card and the generated summary.
                dbc.Col(
                    width=5,
                    children=[
                        controls,
                        dbc.Card(
                            body=True,
                            children=[
                                html.Div(
                                    [
                                        dbc.Label("Risultato"),
                                        dcc.Textarea(
                                            id="risultato",
                                            style={
                                                "width": "100%",
                                                "height": "calc(75vh - 275px)",
                                            },
                                        ),
                                    ],
                                    className="mb-3",
                                )
                            ],
                        ),
                    ],
                ),
                # Right column: the text to summarize.
                dbc.Col(
                    width=7,
                    children=[
                        dbc.Card(
                            body=True,
                            children=[
                                html.Div(
                                    [
                                        dbc.Label("Inserisci il testo"),
                                        dcc.Textarea(
                                            id="testo",
                                            style={"width": "100%", "height": "75vh"},
                                        ),
                                    ],
                                    className="mb-3",
                                )
                            ],
                        )
                    ],
                ),
            ]
        ),
    ],
)
@app.callback(
    [Output("risultato", "value"), Output("time_taken", "children")],
    [
        Input("button-run", "n_clicks"),
        Input("max_length", "value"),
        Input("num_beams", "value"),
    ],
    [State("testo", "value")],
)
def summarize(n_clicks, max_length, num_beams, testo):  # argument order follows app.callback
    """Summarize the text in the input area with the BART model.

    Triggered by the "Summarize" button and by either slider; the text itself
    is read as state, so typing alone does not trigger a run.

    Args:
        n_clicks: Click count of the run button (only used as a trigger).
        max_length: Maximum length of the generated summary, from the slider.
        num_beams: Beam-search width, from the slider.
        testo: Text to summarize; may be None or empty.

    Returns:
        A ``(summary, timing_message)`` pair matching the two callback
        outputs. For missing/empty text this is ``("", "0:0")``.
    """
    if testo is None or testo == "":
        return "", "0:0"

    # Cap on the *input* length. The original referenced an undefined name
    # here and hard-coded 1024 as the generation limit instead.
    # NOTE(review): 1024 is presumably the model's maximum input length -- confirm.
    max_input_tokens = 1024

    inizio = time.time()
    encoded = tokenizer(
        [testo],
        return_tensors='pt',
        max_length=max_input_tokens,
        truncation=True,  # without this, max_length does not shorten long inputs
    )
    input_ids = encoded['input_ids'].to(device)

    summary_ids = model.generate(
        input_ids,
        num_beams=num_beams,
        length_penalty=2.5,
        # Honour the "Max Length Risultato" slider.
        max_length=max_length,
        # The slider goes down to 10, so keep min_length <= max_length.
        min_length=min(20, max_length),
        no_repeat_ngram_size=3,
    )

    decoded = [
        tokenizer.decode(
            summ,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        for summ in summary_ids
    ]
    fine = time.time()

    # Not named `time`: a local with that name would shadow the `time` module
    # for the whole function and break the time.time() calls above.
    elapsed_msg = f"Elaborato con {device} in {fine-inizio:.2f}s"
    sentence = ' '.join(decoded)
    return sentence, elapsed_msg



# Start the Dash server with default settings.
# Recent Dash releases expose `app.run` and no longer support `app.run_server`;
# older releases only provide `run_server`, so use whichever one is available.
if hasattr(app, "run"):
    app.run()
else:
    app.run_server()