from IPython.display import HTML, Markdown
# Notebook helper: a button that shows/hides the code input cells.
# The embedded script hides every `div.input` element once the document is
# ready and flips the visibility on each click of the button.
# NOTE(review): the script relies on jQuery (`$`) being available on the
# page, and the HTML object is only rendered when this expression is the
# last one of a notebook cell -- confirm for the target front-end.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<input type="button" value="Clicca per vedere/nascondere il codice Python" onclick="code_toggle()">''')
import warnings
# Every warning is suppressed; swap in the commented-out 'once' filter below
# to see each distinct warning one time instead.
#warnings.filterwarnings('once')
warnings.filterwarnings('ignore')
# IPython magic (not plain Python): selects matplotlib's inline backend.
%matplotlib inline
import datetime
import urllib.request

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, Markdown
# Display settings: no limit on the number of rows pandas prints, and the
# 'ggplot' style for every matplotlib figure.
pd.set_option('display.max_rows', None)
plt.style.use('ggplot')

# School list; its `i_code` / `i_type` columns are used to look up a school's
# type from its code.
miur = pd.read_csv('bebras_school_list.zip', low_memory=False)
# (prefixes, canonical name) pairs, checked in order against the upper-cased input.
_REGION_PREFIXES = (
    (('FRIULI',), 'FRIULI-VENEZIA GIULIA'),
    (('EMILIA',), 'EMILIA-ROMAGNA'),
    (('TRENTINO', 'ALTO ADIGE'), 'TRENTINO-ALTO ADIGE'),
    (('LOMB',), 'LOMBARDIA'),
    (('VALLE',), "VALLE D'AOSTA"),
)
# Entries known to be meaningless: they are mapped to None.
_REGION_JUNK = frozenset(('G6GG6Y', 'ITALIA'))
# Entries known to denote schools abroad (or typed as such): mapped to 'ESTERO'.
_REGION_FOREIGN = frozenset(('ALBANIA', 'BAVIERA', 'SIERRA', 'DDDD', 'FRANCE'))


def norm_region(r):
    """Normalize the name of a region. It also corrects wrong names.

    Parameters
    ----------
    r : str or missing value
        Region name as typed by the user; surrounding whitespace and letter
        case are ignored.

    Returns
    -------
    str or None
        The canonical upper-case region name, 'ESTERO' for the known foreign
        entries, the stripped upper-case input when no rule applies, and
        None for the known junk entries or when `r` is not a string
        (e.g. None/NaN coming from a column with missing values).
    """
    if not isinstance(r, str):
        # Missing values have no `.strip()`; treat them as "no region".
        return None
    r = r.strip().upper()
    if r == 'FVG':
        return 'FRIULI-VENEZIA GIULIA'
    for prefixes, canonical in _REGION_PREFIXES:
        if r.startswith(prefixes):
            return canonical
    if r in _REGION_JUNK:
        return None
    if r in _REGION_FOREIGN:
        return 'ESTERO'
    return r
def infer_school_type(k):
    """Infer a compact school-level code from a teacher record.

    Parameters
    ----------
    k : mapping or pandas row
        Must provide the string entries 'school_kind' (free-text description)
        and 'school_code'.

    Returns
    -------
    str
        'E', 'M', 'EM', 'S' or 'EMS' according to the keywords found in the
        school description; when nothing matches, the normalized description
        itself is returned so unrecognized kinds stay visible.

    When the school code is present in the module-level `miur` table, the
    `i_type` recorded there replaces the free-text description.
    """
    knorm = k['school_kind'].strip().upper()
    cnorm = k['school_code'].strip().upper()
    if cnorm:
        # Single lookup of the rows registered for this code.
        official = miur.loc[miur['i_code'] == cnorm, 'i_type']
        if official.count() > 0:
            knorm = str(official.iloc[0])
    if 'PRIMARIA' in knorm or 'INFANZIA' in knorm or 'ELEMENTARE' in knorm:
        return 'E'
    if ('PRIMO GRADO' in knorm or ('MEDIA' in knorm and 'SUP' not in knorm)
            or '1°' in knorm or ' I GRADO' in knorm or knorm == 'IC'):
        return 'M'
    if 'COMPRENSIVO' in knorm:
        return 'EM'
    if ('SECONDO GRADO' in knorm or '2°' in knorm or 'II GRADO' in knorm
            or 'LICEO' in knorm or 'ITI' in knorm or 'PROF' in knorm or 'IST TEC' in knorm
            or 'TECNICO' in knorm or 'MAGISTRALE' in knorm or 'SUPERIORE' in knorm):
        return 'S'
    # The original test `knorm == 'STATALE' or 'C.D.38'` was always true
    # (a non-empty string is truthy), which made the fallback unreachable.
    if knorm in ('STATALE', 'C.D.38'):
        return 'EMS'
    return knorm
# The API key is the first line of the local file `secret.key`.
with open('secret.key') as k:
    key = k.readline().strip()

# Download the teacher list for the 2018 edition and cache it in
# teachers.json. The response is closed as soon as it has been read, and the
# file is written as UTF-8 so that it matches the encoding pandas.read_json
# uses by default when it reads the file back.
with urllib.request.urlopen(("https://bebras.it/api?key={}&view=teachers_edition"+
                             "&edition=bebras_2018&subscription=1").format(key)) as r:
    with open("teachers.json", "w", encoding="utf-8") as tw:
        tw.write(r.read().decode('utf-8'))
# Build the teachers table from the cached answer: the first three rows are
# discarded and the remaining ones renumbered from zero.
raw_teachers = pd.read_json("teachers.json", convert_axes=True)
teachers = pd.DataFrame(raw_teachers)[3:].reset_index(drop=True)

# Both time columns hold epoch seconds.
for time_col in ('confirm_time', 'enter_time'):
    teachers[time_col] = pd.to_datetime(teachers[time_col], unit='s')

# Normalize the school code, then derive the school level from kind + code.
teachers['school_code'] = teachers['school_code'].str.strip().str.upper()
kind_and_code = teachers[['school_kind', 'school_code']]
teachers['school_type'] = kind_and_code.apply(infer_school_type, axis=1)
# Headline figures: number of teacher records and total registered teams.
filled = len(teachers)
# Only numeric columns are summed: the table also holds datetime and text
# columns, which cannot be added up.
expected = teachers.sum(numeric_only=True)
# Plain int so that the '{:d}' formats below always apply.
regteams = int(expected['teams_active'])

# After 2018-11-05 the report date is pinned to 2018-11-06.
# (`pd.datetime` was only an alias of datetime.datetime and is no longer
# available in current pandas.)
today = datetime.datetime.today()
if today > datetime.datetime(2018, 11, 5):
    today = datetime.datetime(2018, 11, 6)

s = """*{}:* **{:d}** insegnanti hanno confermato la partecipazione;
ci sono **{:d}** squadre già registrate (~*{:d}* alunni).
"""
# The pupil estimate counts four pupils per team.
display(Markdown(s.format(str(today)[:19],
                          filled, regteams, regteams*4)))

# Up to 2018-11-05, store the figures in a per-day stats file; the history
# plot later reads these `stats-*.txt` files back.
if today <= datetime.datetime(2018, 11, 5):
    isotoday = today.isoformat()[:10]
    with open("stats-" + isotoday + ".txt", "w") as stat:
        stat.write("{:d} {:d} {:d}\n".format(filled, regteams, regteams*4))
# Teacher lists of the previous editions, keyed by year. Each answer is
# cached in teachers<year>.json and then loaded, dropping the first three
# rows exactly as done for the current edition.
oldteachers = {}
for y in [2015, 2016, 2017]:
    fname = "teachers{}.json".format(y)
    # The response is closed after reading; the cache is written as UTF-8,
    # the encoding pandas.read_json uses by default.
    with urllib.request.urlopen(("https://bebras.it/api?key={}&view=teachers_edition"+
                                 "&edition={}").format(key, y)) as r:
        with open(fname, "w", encoding="utf-8") as tw:
            tw.write(r.read().decode('utf-8'))
    oldteachers[y] = pd.DataFrame(pd.read_json(fname, convert_axes=True))[3:]
    # NOTE: no 'school_type' column is computed for the past editions.
# For each past edition: teachers (matched on 'id') that are also in the
# current list, their share among that edition's teachers with at least one
# active team, and the change in the number of active teams.
s = """*{:d}* insegnanti hanno già partecipato all'edizione {:d} (**{:.0f}%** dei partecipanti di quell'edizione),
il numero di squadre è aumentato in media di {:.1f} (deviazione standard {:.0f}).
"""
intersect = {}
for y in [2015, 2016, 2017]:
    past = oldteachers[y]
    both = pd.merge(teachers, past, on='id', how='inner')
    # `_x` columns come from the current edition, `_y` from the past one.
    both['deltateams'] = both['teams_active_x'] - both['teams_active_y']
    intersect[y] = both

    returning = both['id'].count()
    base = len(past[past['teams_active'] > 0])
    share = 100*float(returning)/float(base)
    deltas = both['deltateams']
    display(Markdown(s.format(returning, y, share, deltas.mean(), deltas.std())))
# Teachers present in several editions. Only row counts are needed, so the
# counting merges bring in just the 'id' column of the past edition: this
# gives the same number of rows as merging the full frames, without copying
# unused columns and without re-applying the `_x`/`_y` suffixes to frames
# that already carry suffixed columns (newer pandas refuses merges that
# would produce duplicate column names).
trintersec = pd.merge(intersect[2017], oldteachers[2016], on='id', how='inner')
print("Hanno partecipato nel 2016, 2017 e 2018: {}".format(len(trintersec)))
print("Hanno partecipato nel 2015, 2017 e 2018: {}".format(
    len(pd.merge(intersect[2017], oldteachers[2015][['id']], on='id', how='inner'))))
print("Hanno partecipato nel 2015, 2016 e 2018: {}".format(
    len(pd.merge(intersect[2016], oldteachers[2015][['id']], on='id', how='inner'))))
print("Hanno partecipato nel 2015, 2016, 2017 e 2018: {}".format(
    len(pd.merge(trintersec, oldteachers[2015][['id']], on='id', how='inner'))))
# Number of teachers per school code, restricted to records that have a
# non-empty code, a positive 'subscription' value and a confirmation time
# after 2018-09-01.
# (`pd.datetime` is replaced by datetime.datetime, of which it was an alias.)
has_code = teachers['school_code'].str.strip() != ""
subscribed = teachers['subscription'] > 0
confirmed = teachers['confirm_time'] > datetime.datetime(2018, 9, 1)
institutes = teachers[has_code & subscribed & confirmed].groupby('school_code')['id'].count()
print("Totale istituti con codice meccanografico: {}; numero medio insegnanti per codice: {:.2f}".format(len(institutes), institutes.mean()))
import os

# History of this edition, read from the `stats-*.txt` files in the working
# directory. Each file holds three integers on its first line (teachers,
# teams, pupils). Rows are keyed by the time left until 2018-11-12, measured
# from the file's modification time.
data = []
# Only the top-level directory is needed, so the walk stops after its first
# step instead of descending into every sub-directory.
path, dirs, files = next(os.walk("."))
for f in files:
    if f.startswith("stats-"):
        with open(f, "r") as df:
            nn = [int(x) for x in df.readline().strip().split(" ")]
        # `pd.datetime` is replaced by datetime.datetime (it was an alias).
        dd = datetime.datetime(2018, 11, 12) - datetime.datetime.fromtimestamp(os.stat(f).st_mtime)
        data.append((dd, nn))
data = pd.DataFrame.from_dict(dict(data), orient="index",
                              columns=["insegnanti","squadre","alunni"]).sort_index(ascending=False)
data['giorni'] = (data.index * -1).astype('timedelta64[D]')
# History of the previous edition, read from the `stats-*.txt` files under
# the `old` directory tree. The date comes from the file name
# (stats-YYYY-MM-DD.txt) and rows are keyed by the time left until 2017-11-13.
olddata = []
for path, dirs, files in os.walk("old"):
    for f in files:
        if f.startswith("stats-"):
            d = [int(x) for x in f.split('.')[0].split('-')[1:4]]
            with open(os.path.join(path, f), "r") as df:
                nn = [int(x) for x in df.readline().strip().split(" ")]
            # `pd.datetime` is replaced by datetime.datetime (it was an alias).
            olddata.append((datetime.datetime(2017, 11, 13) - datetime.datetime(*d), nn))
olddata = pd.DataFrame.from_dict(dict(olddata), orient="index",
                                 columns=["insegnanti","squadre","alunni"]).sort_index(ascending=False)
olddata['giorni'] = (olddata.index * -1).astype('timedelta64[D]')
# Two side-by-side panels (teams, teachers) comparing this edition with the
# previous one; the x axis is the negated number of days in the row key.
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(11, 5)
for i, t in enumerate(['squadre', 'insegnanti']):
    panel = ax[i]
    current, previous = data[t], olddata[t]
    panel.plot([-d.days for d in data.index], list(current), label=t + ' 2018')
    panel.plot([-d.days for d in olddata.index], list(previous), '--', label=t + ' 2017')
    panel.legend()
    # Relative change between the two maxima, printed on the panel.
    delta = (current.max() - previous.max()) / previous.max()
    panel.text(-.7*current.count(), .9*current.max(), '{:+.1f}%'.format(delta*100), color='tomato')
plt.show()
# Dati ISTAT sulla popolazione studentesca della scuola primaria e secondaria nel 2014 (fonte: http://dati.istat.it)
# Student population per region: one row per region, one column per school
# level, plus a 'totale' column with the row sum. 'ESTERO' has no population
# figures, so its values are NaN. float('nan') replaces `pd.np.NaN`, since
# the `pd.np` alias is no longer available in current pandas.
istat = pd.DataFrame.from_dict(
    {
        "PIEMONTE": (191399, 117997, 168439),
        "VALLE D'AOSTA": (5981, 3691, 5309),
        "LIGURIA": (61566, 39213, 60184),
        "LOMBARDIA": (468662, 283007, 381619),
        "TRENTINO-ALTO ADIGE": (27028, 16890, 21836),
        "VENETO": (232694, 142401, 204262),
        "FRIULI-VENEZIA GIULIA": (51830, 32143, 46949),
        "EMILIA-ROMAGNA": (198417, 118460, 176968),
        "TOSCANA": (161001, 98203, 152886),
        "UMBRIA": (39181, 23488, 36946),
        "MARCHE": (67996, 42095, 70602),
        "LAZIO": (268133, 161573, 249145),
        "ABRUZZO": (57146, 35828, 58578),
        "MOLISE": (12595, 8354, 14990),
        "CAMPANIA": (317346, 204223, 326644),
        "PUGLIA": (198662, 130675, 213545),
        "BASILICATA": (25237, 17097, 30214),
        "CALABRIA": (93277, 59624, 101208),
        "SICILIA": (254023, 164520, 252730),
        "SARDEGNA": (67379, 44105, 74003),
        "ESTERO": (float('nan'), float('nan'), float('nan')),
    },
    orient="index",
    columns=('Primaria', 'Secondaria I grado', 'Secondaria II grado'))
istat['totale'] = istat['Primaria'] + istat['Secondaria I grado'] + istat['Secondaria II grado']
# Show the population table with floats printed without decimals; the
# format is active only inside the `with` block.
no_decimals = '{:.0f}'.format
with pd.option_context('display.float_format', no_decimals):
    display(istat)
# Contest categories, in increasing order.
CATS = ('kilo', 'mega', 'giga', 'tera', 'peta')
# Display name of each school-level code.
snames = {'E': 'Primaria', 'M': 'Secondaria I grado', 'S': 'Secondaria II grado'}

# Download the exam overview of each category once and cache it in
# overview-<category>.json; the i-th category is requested as test 71+i.
# An existing cache file is never refreshed.
for i, k in enumerate(CATS):
    fname = "overview-{}.json".format(k)
    if not os.path.exists(fname):
        # The response is closed as soon as its body has been read.
        with urllib.request.urlopen("https://bebras.it/api?key={}&view=exams&test={}&examdata=0&edition=bebras_2018&events=0".format(key,71+i)) as r:
            with open(fname, "w") as tw:
                tw.write(r.read().decode('utf-8'))
import json

# Collect the 'exams' records of every cached category overview.
overview = []
for k in CATS:
    with open("overview-{}.json".format(k), "r") as t:
        overview.extend(json.load(t)['exams'])

dfov = pd.DataFrame(overview)


def _named_members(tt):
    """Count the members with a non-empty name; 0 when `tt` is not a dict."""
    if type(tt) is not dict:
        return 0
    return sum(1 for member in tt['members'] if member['name'] != '')


# One row per exam, with renamed columns; 'categoria' is an ordered
# categorical that follows the order of CATS.
gare = pd.DataFrame({
    'categoria': dfov['category'].str.lower().astype(
        pd.api.types.CategoricalDtype(categories=CATS, ordered=True)),
    'insegnante': dfov['teacher_id'].astype('int64'),
    'login': dfov['login'],
    'status': dfov['exam_valid_score'],
    'risultato': dfov['score'],
    'data': pd.to_datetime(dfov['time']),
    'studenti': dfov['team_composition'].map(_named_members),
})
# Teachers indexed by id, with the normalized region of their school.
fid = teachers.set_index('id')
fid['regione'] = fid['school_region'].map(norm_region)

# Attach the teacher's region to every exam, then keep the exams whose
# status equals 1.
region_by_teacher = fid[['regione']]
gare = gare.join(region_by_teacher, on='insegnante')
is_valid = gare['status'] == 1
done = gare[is_valid]
# Number of distinct teachers among the kept exams (shown only when this
# expression is the last one of a notebook cell).
len(done.groupby(['insegnante']))

# Distinct teachers per region and per category.
for grouping in ('regione', 'categoria'):
    display(done.groupby([grouping])['insegnante'].nunique())

# Exams (logins) and pupils per region and category, without decimals.
with pd.option_context('display.float_format', '{:.0f}'.format):
    display(done.groupby(['regione', 'categoria'])['login'].count())
with pd.option_context('display.float_format', '{:.0f}'.format):
    display(done.groupby(['regione', 'categoria'])['studenti'].sum())
# Cartografia ISTAT 2011 (fonte: http://www.istat.it/it/archivio/24613), convertita in GeoJSON con il comando:
#     ogr2ogr -f GeoJSON -s_srs reg2011_g.prj -t_srs EPSG:4326 it.json reg2011_g.shp
import geopandas as gpd
%matplotlib inline
it = gpd.read_file("it.json")
TYPES = ['totale'] + list(snames.values())
dreg = done.groupby(['regione']).count()
dregk = done.groupby(['regione','categoria']).count()
sreg = done.groupby(['regione']).sum()
sregk = done.groupby(['regione','categoria']).sum()
def _sum_positive(frame, keys, label):
    """Sum the positive `label` values of `frame` at `keys`; missing keys count as 0."""
    total = 0
    for j in keys:
        try:
            value = frame.loc[j, label]
        except KeyError:
            # No row for this key: it contributes nothing, and the remaining
            # keys are still examined.
            continue
        # NaN and non-positive values are ignored (NaN > 0 is False).
        if value > 0:
            total += value
    return total


def get_data_with_default(geo, i, t, ddata, sdata, jj, labeld='login', labels='studenti'):
    """Store in `geo` the team and pupil totals of row `i` for layer `t`.

    Parameters
    ----------
    geo : DataFrame
        Table updated in place; the cells (i, 'squadre <t>') and
        (i, 'studenti <t>') are written (the columns are created if needed).
    i : label
        Row label in `geo`.
    t : str
        Layer name appended to the column names.
    ddata, sdata : DataFrame
        Sources of the team counts and of the pupil sums respectively.
    jj : iterable
        Row keys of `ddata`/`sdata` whose values are added together.
    labeld, labels : str
        Columns read from `ddata` and `sdata`.

    Keys missing from a source table count as zero. Unlike the previous
    implementation, a missing key no longer discards the keys that follow it,
    and only KeyError is treated as "missing" (the former bare `except`
    silenced every error, including KeyboardInterrupt).
    """
    geo.loc[i, 'squadre' + ' ' + t] = _sum_positive(ddata, jj, labeld)
    geo.loc[i, 'studenti' + ' ' + t] = _sum_positive(sdata, jj, labels)
# Categories that feed each school-level layer, in the order of TYPES[1:].
level_categories = (('kilo',), ('mega', 'giga'), ('tera', 'peta'))
# Population column used for each layer, in the order of TYPES.
population_columns = ('totale', snames['E'], snames['M'], snames['S'])

for i, r in it.iterrows():
    for cname in istat.index:
        # A map region is paired with the first population row whose name
        # shares its first five characters.
        if r['NOME_REG'][0:5] != cname[0:5]:
            continue
        it.loc[i, 'NOME_REG'] = cname
        get_data_with_default(it, i, TYPES[0], dreg, sreg, [cname])
        for t, cats in zip(TYPES[1:], level_categories):
            get_data_with_default(it, i, t, dregk, sregk, [(cname, cat) for cat in cats])
        for t, pop_col in zip(TYPES, population_columns):
            it.loc[i, 'popolazione ' + t] = istat.loc[cname, pop_col]
        break

# Coverage: participating pupils per thousand of the reference population.
for t in TYPES:
    it['copertura ' + t] = 1000 * it['studenti ' + t] / it['popolazione ' + t]
# 2x2 grid of choropleth maps, one per layer in TYPES, coloured by coverage
# quantiles; the figure is also saved as italia.pdf.
fig, ax = plt.subplots(2, 2)
fig.set_size_inches(15, 11)
for i, t in enumerate(TYPES):
    r, c = divmod(i, 2)
    panel = ax[r][c]
    panel.set_aspect("equal")
    panel.set_axis_off()
    panel.set_title("Studenti ogni mille ({})".format(t))
    it.plot(ax=panel, column='copertura ' + t, cmap='YlOrRd', scheme='quantiles', legend=True)
fig.savefig('italia.pdf')
plt.show()
# World geometries, indexed by country name.
w = gpd.read_file("world.json").set_index("name")

# Per-country figures from wbebras.json; 'copertura' is the 'bebras' value
# per thousand of the 'oecd' value.
with open("wbebras.json", "r") as t:
    wbebras = pd.DataFrame(pd.read_json(t, convert_axes=True, orient='index'))
wbebras['copertura'] = 1000 * wbebras["bebras"] / wbebras["oecd"]

# Copy the three figures onto the map table, country by country.
for i in wbebras.index:
    try:
        for column in ("bebras", "oecd", "copertura"):
            w.loc[i, column] = wbebras.loc[i, column]
    except Exception:
        # Best effort: print the country that could not be copied and go on.
        # `except Exception` (instead of a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate.
        print(i)
def _map_axes(position, title):
    """Create the subplot at `position` with equal aspect, hidden axes and `title`."""
    axes = plt.subplot(position)
    axes.set_aspect("equal")
    axes.set_axis_off()
    axes.set_title(title)
    return axes


plt.figure(figsize=(20, 20))

# Lower map: coverage, using only the rows without any missing value.
ax = _map_axes(212, "Partecipanti ogni 1000 studenti (dati OECD 2015)")
w.dropna().plot(ax=ax, column='copertura', cmap='Blues', scheme='quantiles', legend=True)

# Upper map: absolute participants, for the rows that have a 'bebras' value.
ax = _map_axes(211, "Partecipanti Bebras 2017")
p = w.dropna(subset=["bebras"]).plot(ax=ax, column='bebras', cmap='YlOrRd', scheme='quantiles', legend=True)
plt.show()

# Table sorted by number of participants, followed by the world total.
ranking = wbebras.sort_values("bebras", ascending=False)
display(ranking[["bebras", "oecd", "copertura"]])
print("In totale nel mondo {} partecipanti".format(wbebras['bebras'].sum()))