# Notebook helper: inject a jQuery snippet that hides/shows all code cells,
# plus a toggle button (label in Italian: "Click to show/hide Python code").
# Code is hidden by default because code_toggle() runs once on document ready.
from IPython.display import HTML, Markdown
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<input type="button" value="Clicca per vedere/nascondere il codice Python" onclick="code_toggle()">''')
# Notebook-wide setup: inline matplotlib figures, pandas/plot defaults.
%matplotlib inline
import pandas as pd
import urllib.request
from IPython.display import display, Markdown
import matplotlib.pyplot as plt
# Show every row of displayed DataFrames (tables in this report are small).
pd.options.display.max_rows = None
plt.style.use('ggplot')
# Read the private API key from an untracked file, then download the full
# teachers list from the Bebras API and cache it to disk as JSON.
with open('secret.key') as k:
    key = k.readline().strip()
r = urllib.request.urlopen("https://bebras.it/api?key={}&view=teachers".format(key))
with open("teachers.json", "w") as tw:
    tw.writelines(r.read().decode('utf-8'))
# Load the cached JSON into a DataFrame; the first 3 rows are dropped —
# presumably test/staff accounts, TODO confirm against the API.
teachers = pd.DataFrame(pd.read_json("teachers.json", convert_axes=True))[3:]
teachers.index = range(len(teachers))
# confirm_time arrives as a Unix epoch (seconds) — convert to Timestamp.
teachers['confirm_time'] = pd.to_datetime(teachers['confirm_time'], unit='s')
import datetime

# Teachers who confirmed participation after subscriptions opened (18 Sep 2017).
# NOTE: pd.datetime was removed in pandas 1.0 — use the datetime module directly.
filledt = teachers[(teachers['subscription'] > 0) &
                   (teachers['confirm_time'] > datetime.datetime(2017, 9, 18))]
# Normalize school codes in place (strip whitespace, uppercase).
teachers['school_code'] = teachers['school_code'].str.strip().str.upper()
filled = len(filledt)
# numeric_only avoids a TypeError on the string columns with modern pandas
# (older pandas silently skipped non-numeric columns, so the result is the same).
expected = filledt.sum(numeric_only=True)
regteams = expected['teams_active']
today = datetime.datetime.today()
# Markdown status line for the notebook: timestamp, confirmed teachers,
# registered teams with the ~4-pupils-per-team estimate, and new sign-ups.
s = ("*{}:* **{:d}** insegnanti hanno confermato la partecipazione;\n"
     "ci sono **{:d}** squadre già registrate (~*{:d}* alunni).\n"
     "Nuovi insegnanti iscritti dal 18/9: **{:d}**.\n")
# NOTE(review): 1235 looks like the teacher count before 18 Sep 2017 — confirm.
msg = s.format(str(today)[:19], filled, regteams, regteams * 4,
               len(teachers) - 1235)
display(Markdown(msg))
import datetime

# Until registrations close (5 Nov 2017), snapshot today's figures to a
# stats-YYYY-MM-DD.txt file so the trend plots below can read the series.
# NOTE: pd.datetime was removed in pandas 1.0 — use the datetime module.
if today < datetime.datetime(2017, 11, 5):
    isotoday = today.isoformat()[:10]
    with open("stats-" + isotoday + ".txt", "w") as stat:
        stat.write("{:d} {:d} {:d}\n".format(filled, regteams, regteams*4))
import datetime

# Distinct institutes (by mechanographic school code) among confirmed
# teachers, and the average number of teachers per institute.
# NOTE: pd.datetime was removed in pandas 1.0 — use the datetime module.
institutes = teachers[(teachers['school_code'].str.strip() != "")
                      & (teachers['subscription'] > 0)
                      & (teachers['confirm_time'] > datetime.datetime(2017, 9, 18))].groupby('school_code')['id'].count()
print("Totale istituti con codice meccanografico: {}; numero medio insegnanti {:.2f}".format(len(institutes), institutes.mean()))
import os
import datetime

# Collect this year's daily snapshots (stats-YYYY-MM-DD.txt in the current
# directory) and index them by time remaining before the contest (13 Nov
# 2017), derived from each file's modification time.
# Fixes: the date parsed from the file name was dead code (immediately
# overwritten), and DataFrame.from_items was removed in pandas 1.0.
data = []
for f in os.listdir("."):
    if f.startswith("stats-") and os.path.isfile(f):
        with open(f, "r") as df:
            nn = [int(x) for x in df.readline().strip().split(" ")]
        d = datetime.datetime(2017, 11, 13) - datetime.datetime.fromtimestamp(os.stat(f).st_mtime)
        data.append((d, nn))
# Build row-wise from a dict (from_items replacement); mtimes are unique
# enough to serve as keys.
data = pd.DataFrame.from_dict(dict(data), orient="index",
                              columns=["insegnanti", "squadre", "alunni"]).sort_index(ascending=False)
import datetime

# Same series for the 2016 edition, read from old/stats-YYYY-MM-DD.txt.
# Here the date IS parsed from the file name, and the index is the time
# remaining before the 2016 contest (6 Nov 2016).
# NOTE: pd.datetime and DataFrame.from_items were removed in pandas 1.0.
olddata = []
for path, dirs, files in os.walk("old"):
    for f in files:
        if f.startswith("stats-"):
            d = [int(x) for x in f.split('.')[0].split('-')[1:4]]
            with open(path + "/" + f, "r") as df:
                nn = [int(x) for x in df.readline().strip().split(" ")]
            olddata.append((datetime.datetime(2016, 11, 6) - datetime.datetime(*d), nn))
olddata = pd.DataFrame.from_dict(dict(olddata), orient="index",
                                 columns=["insegnanti", "squadre", "alunni"]).sort_index(ascending=False)
# Side-by-side time series of registered teams and confirmed teachers,
# current edition vs 2016 (dashed). The annotation shows the relative
# change of this year's peak against last year's.
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(11, 5)
for idx, serie in enumerate(['squadre', 'insegnanti']):
    panel = ax[idx]
    data[serie].plot(ax=panel, legend=True)
    olddata[serie].plot(ax=panel, legend=True, style='--', label=serie + ' 2016')
    peak_now = data[serie].max()
    peak_old = olddata[serie].max()
    delta = (peak_now - peak_old) / peak_old
    panel.text(.7 * data[serie].count(), .9 * peak_now,
               '{:+.1f}%'.format(delta * 100), color='tomato')
plt.show()
Dati ISTAT popolazione studentesca 2014 (fonte: http://dati.istat.it)
import numpy as np

# ISTAT 2014 student population per region and school level:
# E = primaria, M = secondaria di I grado, S = secondaria di II grado.
# 'ESTERO' (schools abroad) has no reference population.
# NOTE: DataFrame.from_items and pd.np were removed in pandas 1.0 — build
# from an (insertion-ordered) dict and use numpy for NaN.
istat = pd.DataFrame.from_dict({
    "PIEMONTE": (191399, 117997, 168439),
    "VALLE D'AOSTA": (5981, 3691, 5309),
    "LIGURIA": (61566, 39213, 60184),
    "LOMBARDIA": (468662, 283007, 381619),
    "TRENTINO-ALTO ADIGE": (27028, 16890, 21836),
    "VENETO": (232694, 142401, 204262),
    "FRIULI-VENEZIA GIULIA": (51830, 32143, 46949),
    "EMILIA-ROMAGNA": (198417, 118460, 176968),
    "TOSCANA": (161001, 98203, 152886),
    "UMBRIA": (39181, 23488, 36946),
    "MARCHE": (67996, 42095, 70602),
    "LAZIO": (268133, 161573, 249145),
    "ABRUZZO": (57146, 35828, 58578),
    "MOLISE": (12595, 8354, 14990),
    "CAMPANIA": (317346, 204223, 326644),
    "PUGLIA": (198662, 130675, 213545),
    "BASILICATA": (25237, 17097, 30214),
    "CALABRIA": (93277, 59624, 101208),
    "SICILIA": (254023, 164520, 252730),
    "SARDEGNA": (67379, 44105, 74003),
    "ESTERO": (np.nan, np.nan, np.nan)},
    orient="index", columns=('E', 'M', 'S'))
istat['totale'] = istat['E'] + istat['M'] + istat['S']
display(istat)
# Official MIUR school registry (zipped CSV): maps each mechanographic
# code ('i_code') to the official school type ('i_type'), used below to
# infer the school level when a teacher provided a valid code.
miur = pd.read_csv('bebras_school_list.zip', low_memory=False)
def norm_region(r):
    """Normalize a region name, correcting known misspellings and variants.

    Foreign/garbage entries (e.g. 'ALBANIA', 'DDDD') map to 'ESTERO';
    anything unrecognized is returned stripped and upper-cased.
    """
    name = r.strip().upper()
    if name == 'FVG':
        return 'FRIULI-VENEZIA GIULIA'
    # Canonical name by (unambiguous) prefix.
    canonical_by_prefix = (
        ('FRIULI', 'FRIULI-VENEZIA GIULIA'),
        ('EMILIA', 'EMILIA-ROMAGNA'),
        ('TRENTINO', 'TRENTINO-ALTO ADIGE'),
        ('LOMB', 'LOMBARDIA'),
    )
    for prefix, canonical in canonical_by_prefix:
        if name.startswith(prefix):
            return canonical
    if name in ('ALBANIA', 'BAVIERA', 'SIERRA', 'DDDD'):
        return 'ESTERO'
    return name
def infer_school_type(k):
    """Infer the school level for a teacher record.

    k: row (or mapping) with 'school_kind' (free text) and 'school_code'.
    If the mechanographic code is found in the MIUR registry, the official
    type overrides the user-entered kind before keyword matching.
    Returns 'E', 'M', 'S', 'EM', 'EMS', or the normalized kind itself
    when no rule matches.
    """
    knorm = k['school_kind'].strip().upper()
    cnorm = k['school_code'].strip().upper()
    # Prefer the official MIUR type when the code is known.
    if cnorm and miur[miur['i_code'] == cnorm]['i_type'].count() > 0:
        knorm = str(miur[miur['i_code'] == cnorm]['i_type'].iloc[0])
    if 'PRIMARIA' in knorm or 'INFANZIA' in knorm or 'ELEMENTARE' in knorm:
        return 'E'
    if 'PRIMO GRADO' in knorm or ('MEDIA' in knorm and (not 'SUP' in knorm))\
       or '1°' in knorm or ' I GRADO' in knorm or knorm == 'IC':
        return 'M'
    if 'COMPRENSIVO' in knorm:
        return 'EM'
    if 'SECONDO GRADO' in knorm or '2°' in knorm or 'II GRADO' in knorm \
       or 'LICEO' in knorm or 'ITI' in knorm or 'PROF' in knorm or 'IST TEC' in knorm \
       or 'TECNICO' in knorm or 'MAGISTRALE' in knorm or 'SUPERIORE' in knorm:
        return 'S'
    # BUG FIX: the original tested `knorm == 'STATALE' or 'C.D.38'`, which is
    # always truthy ('C.D.38' is a non-empty string), so every unmatched kind
    # was classified 'EMS' and the fallback below was unreachable.
    if knorm in ('STATALE', 'C.D.38'):
        return 'EMS'
    return knorm
# Per-teacher table: normalized region, inferred school level and expected
# teams, then aggregated by (region, level).
stat = pd.DataFrame({
    'regione': filledt['school_region'].map(norm_region),
    'tipo': filledt[['school_kind', 'school_code']].apply(infer_school_type, axis=1),
    'squadre attese': filledt['teams_active'],
})
expected = stat.groupby(['regione', 'tipo']).aggregate('sum')
# Schools tagged with several levels (e.g. 'EM' for istituti comprensivi)
# have their expected teams split evenly across the single levels; each
# (region, level) row also gets the matching ISTAT population. Missing
# entries in the ISTAT table are reported and set to NaN.
# Fixes: bare `except:` narrowed to KeyError (a missing .loc label is the
# only expected failure); positional row[0] replaced by label access
# (positional Series indexing is deprecated in modern pandas); pd.np was
# removed in pandas 1.0.
# NOTE(review): rows appended inside the loop while iterating `expected`
# — preserved from the original; verify iterrows sees a stable snapshot.
for (reg, tipo), row in expected.iterrows():
    share = row['squadre attese']
    if len(tipo) > 1:
        for t in tipo:
            try:
                expected.loc[(reg, t), 'squadre attese'] += share / len(tipo)
            except KeyError:
                # Level not yet present for this region: create the row.
                expected.loc[(reg, t), 'squadre attese'] = share / len(tipo)
            try:
                expected.loc[(reg, t), 'popolazione'] = istat.loc[reg, t]
            except KeyError:
                print(":{}:{}:NOT FOUND".format(reg, t))
                expected.loc[(reg, t), 'popolazione'] = float('nan')
    else:
        try:
            expected.loc[(reg, tipo), 'popolazione'] = istat.loc[reg, tipo]
        except KeyError:
            print("_{}_{}_NOT FOUND".format(reg, tipo))
            expected.loc[(reg, tipo), 'popolazione'] = float('nan')
# Keep only the three atomic levels, then derive pupils (4 per team) and
# coverage per thousand students.
atomic_levels = expected.index.isin(['E', 'M', 'S'], level=1)
expected = expected[atomic_levels].sort_index()
expected['alunni attesi'] = expected['squadre attese'] * 4
expected['copertura (alunni ogni mille)'] = (
    1000 * expected['alunni attesi'] / expected['popolazione'])
display(expected)
# Aggregate by school level, nation-wide, then by region.
cols = ['squadre attese', 'alunni attesi', 'popolazione']
tot = expected[cols].groupby(level='tipo').sum()
tot['copertura (alunni ogni mille)'] = 1000 * tot['alunni attesi'] / tot['popolazione']
display(tot)
# Grand total across all levels.
glob = tot.sum()
template = ("squadre attese: {}\t alunni attesi: {}\n"
            "popolazione: {}\t copertura (alunni ogni mille) {:0.1f}")
print(template.format(int(glob["squadre attese"]),
                      int(glob["alunni attesi"]),
                      int(glob["popolazione"]),
                      1000 * glob["alunni attesi"] / glob["popolazione"]))
exp_reg = expected[cols].groupby(level='regione').sum()
exp_reg['copertura (alunni ogni mille)'] = 1000 * exp_reg['alunni attesi'] / exp_reg['popolazione']
display(exp_reg)
Cartografia ISTAT 2011 (fonte: http://www.istat.it/it/archivio/24613), convertita con il comando:
ogr2ogr -f GeoJSON -s_srs reg2011_g.prj -t_srs EPSG:4326 it.json reg2011_g.shp
%%capture _
!conda install -y -c conda-forge geopandas
# geopandas: GeoDataFrames + choropleth plotting on the ISTAT cartography.
import geopandas as gpd
%matplotlib inline
# Italian regions layer (GeoJSON converted from the ISTAT 2011 shapefile).
it = gpd.read_file("it.json")
# Map-column labels, one per school level plus the overall total.
TYPES = ('totale', 'primaria', 'secondaria primo grado', 'secondaria secondo grado')
def get_data_with_default(geo, i, t, data, j, label='squadre attese'):
    """Copy data.loc[j, label] into geo.loc[i, '<label> <t>'], defaulting to 0.

    Returns the value actually stored so callers can use it directly.
    FIX: the original used a bare `except` plus a `return` inside `finally`,
    which silently swallowed *any* in-flight exception; only a missing key
    in `data` (KeyError) is an expected failure, so only that maps to 0.
    """
    col = label + ' ' + t
    try:
        geo.loc[i, col] = data.loc[j, label]
    except KeyError:
        geo.loc[i, col] = 0
    return geo.loc[i, col]
# Join the expected-participation tables onto the map layer. Region names
# are matched on their first 5 characters (map spellings differ from the
# ISTAT ones, e.g. accents/hyphens), and the canonical ISTAT name is then
# written back onto the map row.
for i, r in it.iterrows():
    for cname in istat.index:
        if r['NOME_REG'][0:5] == cname[0:5]:
            it.loc[i, 'NOME_REG'] = cname
            # Expected teams: overall, then per level (E/M/S); missing
            # (region, level) pairs default to 0 via the helper.
            get_data_with_default(it, i, TYPES[0], exp_reg, cname)
            get_data_with_default(it, i, TYPES[1], expected, (cname, 'E'))
            get_data_with_default(it, i, TYPES[2], expected, (cname, 'M'))
            get_data_with_default(it, i, TYPES[3], expected, (cname, 'S'))
            # Reference student population for each level.
            it.loc[i, 'popolazione ' + TYPES[0]] = istat.loc[cname, 'totale']
            it.loc[i, 'popolazione ' + TYPES[1]] = istat.loc[cname, 'E']
            it.loc[i, 'popolazione ' + TYPES[2]] = istat.loc[cname, 'M']
            it.loc[i, 'popolazione ' + TYPES[3]] = istat.loc[cname, 'S']
            break
# Derived columns: expected pupils (4 per team) and coverage per thousand.
for t in TYPES:
    it['alunni attesi ' + t] = it['squadre attese ' + t] * 4
    it['copertura ' + t] = 1000 * it['alunni attesi ' + t] / it['popolazione ' + t]
# 2x2 grid of choropleths (expected pupils per thousand students), one
# panel per school level; also saved to italia.pdf.
fig, ax = plt.subplots(2, 2)
fig.set_size_inches(15, 11)
for idx, t in enumerate(TYPES):
    panel = ax[idx // 2][idx % 2]
    panel.set_aspect("equal")
    panel.set_axis_off()
    panel.set_title("Alunni attesi ogni mille ({})".format(t))
    it.plot(ax=panel, column='copertura ' + t, cmap='YlOrRd', scheme='quantiles', legend=True)
fig.savefig('italia.pdf')
plt.show()
# World layer joined with Bebras participation data (wbebras.json, indexed
# by country name): absolute participants ('bebras'), OECD student
# population ('oecd') and coverage per thousand students.
w = gpd.read_file("world.json")
w = w.set_index("name")
with open("wbebras.json", "r") as t:
    wbebras = pd.DataFrame(pd.read_json(t, convert_axes=True, orient='index'))
wbebras['copertura'] = 1000 * wbebras["bebras"] / wbebras["oecd"]
# Best-effort join: countries whose name fails to match the map layer are
# printed for manual reconciliation.
# FIX: bare `except:` also caught KeyboardInterrupt/SystemExit; Exception
# keeps the deliberate best-effort behavior without hiding those.
for i in wbebras.index:
    try:
        w.loc[i, "bebras"] = wbebras.loc[i, "bebras"]
        w.loc[i, "oecd"] = wbebras.loc[i, "oecd"]
        w.loc[i, "copertura"] = wbebras.loc[i, "copertura"]
    except Exception:
        print(i)
# Two stacked world choropleths — coverage per thousand students and
# absolute 2016 participants — followed by the ranked country table and
# the world total.
plt.figure(figsize=(20, 20))

def _world_panel(pos, title):
    # Bare, equal-aspect subplot for a map panel.
    axis = plt.subplot(pos)
    axis.set_aspect("equal")
    axis.set_axis_off()
    axis.set_title(title)
    return axis

ax = _world_panel(212, "Partecipanti 2016 ogni 1000 studenti (dati OECD 2015)")
w.dropna().plot(ax=ax, column='copertura', cmap='Blues', scheme='quantiles', legend=True)
ax = _world_panel(211, "Partecipanti Bebras 2016")
p = w.dropna(subset=["bebras"]).plot(ax=ax, column='bebras', cmap='YlOrRd', scheme='quantiles', legend=True)
display(wbebras.sort_values("bebras", ascending=False)[["bebras", "oecd", "copertura"]])
print("In totale nel mondo {} partecipanti".format(wbebras['bebras'].sum()))
# Contest categories in increasing age order; the API exposes each
# category's exams as test ids 50..54. Download and cache one overview
# file per category.
CATS = ('kilo', 'mega', 'giga', 'tera', 'peta')
for offset, cat in enumerate(CATS):
    url = "https://bebras.it/api?key={}&view=exams&test={}&examdata=0".format(key, 50 + offset)
    response = urllib.request.urlopen(url)
    with open("overview-{}.json".format(cat), "w") as tw:
        tw.writelines(response.read().decode('utf-8'))
import json
# Merge the five cached per-category overviews into a single flat record
# list, then into a DataFrame.
overview = []
for cat in CATS:
    with open("overview-{}.json".format(cat), "r") as fh:
        overview.extend(json.load(fh)['exams'])
dfov = pd.DataFrame(overview)
# Per-exam table with Italian column names; 'categoria' is an ordered
# categorical following the contest size order (kilo < ... < peta) so
# groupbys/sorts respect it.
# FIX: Series.astype('category', categories=..., ordered=...) was removed
# in pandas 0.25 — pass a CategoricalDtype instead.
gare = pd.DataFrame()
gare['categoria'] = dfov['category'].str.lower().astype(
    pd.CategoricalDtype(categories=CATS, ordered=True))
gare['insegnante'] = dfov['teacher_id'].astype('int64')
gare['login'] = dfov['login']
gare['status'] = dfov['exam_valid_score']
gare['risultato'] = dfov['score']
gare['data'] = pd.to_datetime(dfov['time'])
# Attach each exam to its teacher's (normalized) region via the teacher id.
fid = filledt.set_index('id')
fid['regione'] = fid['school_region'].map(norm_region)
gare = gare.join(fid[['regione']],on='insegnante')
# Keep only exams with a valid score.
done = gare[gare['status'] == 1]
# The bare expressions below are notebook cell outputs, not dead code:
# number of distinct teachers with at least one valid exam, ...
len(done.groupby(['insegnante']))
# ... distinct teachers per region and per category, ...
display(done.groupby(['regione'])['insegnante'].nunique())
display(done.groupby(['categoria'])['insegnante'].nunique())
# ... and the full region x category breakdown.
done.groupby(['regione', 'categoria']).count()