from IPython.display import HTML
# Build an HTML snippet for the notebook front-end: a script defining
# code_toggle(), which hides the 'div.input' elements when code_show is true,
# shows them otherwise, and then flips code_show.  code_toggle is registered
# to run once when the document is ready (code_show starts as true, so that
# first run hides the inputs), and a button calls it again on every click.
# NOTE(review): the value is only rendered if this is the last expression of
# a notebook cell -- presumably the case in the original notebook; confirm.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<input type="button" value="Clicca per vedere/nascondere il codice Python" onclick="code_toggle()">''')
import pandas as pd
import urllib2
from IPython.display import display, Markdown
# Refresh the on-disk JSON caches from the remote service.  The URL of the
# "teachers" feed is the first line of secret.key; the "overview" feed URL is
# derived from it by replacing the word "teachers" with "overview".
try:
    with open('secret.key') as k:
        key = k.readline().strip()
    for url, cache_name in ((key, "teachers.json"),
                            (key.replace("teachers", "overview"), "overview.json")):
        t = urllib2.urlopen(url)
        try:
            # Download everything before opening the cache file for writing,
            # so a failed transfer cannot leave a truncated cache behind.
            payload = t.read()
        finally:
            t.close()
        with open(cache_name, "w") as tw:
            tw.write(payload)
except Exception:
    # Best effort: with no key file or no network the notebook keeps working
    # from the cache files already on disk.  Catching Exception (instead of a
    # bare except) lets KeyboardInterrupt and SystemExit through.
    print("Caricamento dati dalla cache")
# Load the cached teacher records into a DataFrame.
with open("teachers.json", "r") as t:
    # NOTE(review): the first three records are skipped -- presumably
    # test/administrative accounts; confirm against the data source.
    teachers = pd.read_json(t, convert_axes=True)[3:]
# Renumber the rows from 0 after dropping the leading records.
teachers.index = range(len(teachers))

# Keep only teachers who declared (or already registered) at least one team
# and who filled in a non-empty school type.
has_teams = (teachers['teams_expected'] > 0) | (teachers['teams_active'] > 0)
has_school_type = (teachers['school_type'].notnull()) & (teachers['school_type'] != '')
filledt = teachers[has_teams & has_school_type]
filled = len(filledt)

# Sum only the two numeric columns that are used below; summing the whole
# frame would also try to add up the text columns.
expected = filledt[['teams_expected', 'teams_active']].sum()
# A column containing missing values is summed as float, and the '{:d}'
# format code rejects floats: convert the totals to plain ints.
expteams = int(expected['teams_expected'])
regteams = int(expected['teams_active'])

today = pd.datetime.today()
# The pupil figures in the message are estimated as 4 pupils per team.
s = """Alla data del *{}:* **{:d}** insegnanti hanno stimato il numero di squadre, per un totale di **{:d}** squadre (~*{:d}* alunni);
ci sono **{:d}** squadre già registrate (~*{:d}* alunni)."""
# str(today)[:19] keeps "YYYY-MM-DD HH:MM:SS" and drops the fractional seconds.
display(Markdown(s.format(str(today)[:19], filled, expteams, expteams*4, regteams, regteams*4)))
# Until the 2016-11-07 cut-off, record today's figures (teachers, expected
# teams, expected pupils at 4 per team) in stats-<YYYY-MM-DD>.txt.
if today < pd.datetime(2016, 11, 7):
    isotoday = today.isoformat()[:10]
    # Format first and open second: if formatting fails, no empty snapshot
    # file is left behind to break the code that reads the history back.
    # int() is needed because '{:d}' rejects a float total.
    stats_line = "{:d} {:d} {:d}\n".format(filled, int(expteams), int(expteams) * 4)
    with open("stats-" + isotoday + ".txt", "w") as stat:
        stat.write(stats_line)
import os
# Rebuild the day-by-day history from every stats-<YYYY-MM-DD>.txt snapshot
# found under the current directory.
data = []
for path, dirs, files in os.walk("."):
    for f in files:
        if f.startswith("stats-"):
            # The file name carries the date: stats-<year>-<month>-<day>.txt
            d = [int(part) for part in f.split('.')[0].split('-')[1:4]]
            # os.walk yields bare file names: join them with the directory
            # being visited, otherwise snapshots in subdirectories cannot be
            # opened (or a same-named file in the top directory is read).
            with open(os.path.join(path, f), "r") as df:
                nn = [int(field) for field in df.readline().strip().split(" ")]
            d = pd.datetime(*d)
            data.append((d, nn))
# Order the snapshots chronologically so the frame does not depend on the
# order in which the file system lists the files.
data.sort(key=lambda snapshot: snapshot[0])
data = pd.DataFrame.from_items(data, orient="index", columns=["insegnanti","squadre","alunni"])
# Reference figures from the previous edition.
lasty_teachers = 66 + 82 + 124
lasty_teams = 3465

# Two-point frame: all zeros on 2016-09-15 and last year's totals (pupils
# counted as 4 per team) on 2016-11-01.  Plotting it draws the straight
# "linear interpolation" lines named in the column labels.
_lasty_start = (pd.datetime(2016, 9, 15), [0, 0, 0])
_lasty_end = (pd.datetime(2016, 11, 1),
              [lasty_teachers, lasty_teams, lasty_teams * 4])
_lasty_columns = [
    "insegnanti 2015 (interpolazione lineare)",
    "squadre 2015 (interpolazione lineare)",
    "alunni 2015 (interpolazione lineare)",
]
lasty = pd.DataFrame.from_items([_lasty_start, _lasty_end],
                                orient="index",
                                columns=_lasty_columns)
import matplotlib.pyplot as plt
# Use the 'ggplot' matplotlib style for the figures created from here on.
plt.style.use('ggplot')
# IPython magic (not Python syntax): render figures inline in the notebook.
%matplotlib inline
# One wide figure holding two side-by-side panels.
plt.figure(figsize=(12,5))
# Left panel: recorded teams and pupils, plus the dashed 2015 reference lines.
# The order of the plot calls determines the legend order.
plt.subplot(121)
ax = data['squadre'].plot(legend=True)
data['alunni'].plot(ax=ax, legend=True)
lasty['alunni 2015 (interpolazione lineare)'].plot(style="--", ax=ax, legend=True)
lasty['squadre 2015 (interpolazione lineare)'].plot(style="--", ax=ax, legend=True)
# Thin grey vertical marker at 2016-11-01, the end point of the reference lines.
ax.axvline(x=pd.datetime(2016,11,1), linewidth=.5, color='gray')
# Right panel: recorded teachers against the dashed 2015 reference line.
plt.subplot(122)
ax = data['insegnanti'].plot(legend=True)
lasty['insegnanti 2015 (interpolazione lineare)'].plot(style="--", ax=ax, legend=True)
# NOTE(review): the result is bound to p presumably so that the notebook does
# not echo it as the cell's output -- confirm.
p = ax.axvline(x=pd.datetime(2016,11,1), linewidth=.5, color='gray')
Si considera il numero di account riutilizzati: il numero di insegnanti che ripetono l'esperienza è quindi stimato per difetto, in quanto un insegnante iscritto con una nuova username risulta nuovo.
# Accounts with at least one active team whose id is at most 324, expressed
# as a percentage of last year's number of teachers.
# NOTE(review): 324 is presumably the highest account id issued in the
# previous edition -- confirm.
_reused = teachers[(teachers["teams_active"] > 0) & (teachers["id"] <= 324)]
_reused_pct = 100 * len(_reused) / float(lasty_teachers)
print("Account riutilizzati: {:0.1f}%".format(_reused_pct))
Dati ISTAT popolazione studentesca 2014 (fonte: http://dati.istat.it)
# Student population per region.  Each row is (E, M, S); further down these
# codes are paired with the labels 'primaria', 'secondaria primo grado' and
# 'secondaria secondo grado'.  "ESTERO" has no population figures.
_nan = pd.np.NaN
_istat_rows = [
    ("PIEMONTE",              (191399, 117997, 168439)),
    ("VALLE D'AOSTA",         (5981, 3691, 5309)),
    ("LIGURIA",               (61566, 39213, 60184)),
    ("LOMBARDIA",             (468662, 283007, 381619)),
    ("TRENTINO-ALTO ADIGE",   (27028, 16890, 21836)),
    ("VENETO",                (232694, 142401, 204262)),
    ("FRIULI-VENEZIA GIULIA", (51830, 32143, 46949)),
    ("EMILIA-ROMAGNA",        (198417, 118460, 176968)),
    ("TOSCANA",               (161001, 98203, 152886)),
    ("UMBRIA",                (39181, 23488, 36946)),
    ("MARCHE",                (67996, 42095, 70602)),
    ("LAZIO",                 (268133, 161573, 249145)),
    ("ABRUZZO",               (57146, 35828, 58578)),
    ("MOLISE",                (12595, 8354, 14990)),
    ("CAMPANIA",              (317346, 204223, 326644)),
    ("PUGLIA",                (198662, 130675, 213545)),
    ("BASILICATA",            (25237, 17097, 30214)),
    ("CALABRIA",              (93277, 59624, 101208)),
    ("SICILIA",               (254023, 164520, 252730)),
    ("SARDEGNA",              (67379, 44105, 74003)),
    ("ESTERO",                (_nan, _nan, _nan)),
]
istat = pd.DataFrame.from_items(_istat_rows, orient="index", columns=('E', 'M', 'S'))
# Plain addition (not a NaN-skipping sum) so the ESTERO total stays NaN.
istat['totale'] = istat['E'] + istat['M'] + istat['S']
display(istat)
def norm_region(r):
    """Normalize the name of a region, also correcting known wrong names.

    The name is stripped and upper-cased; 'FVG' and anything starting with
    'FRIULI', 'EMILIA' or 'TRENTINO' is mapped to the canonical region name,
    and 'ALBANIA' / 'BAVIERA' are mapped to 'ESTERO'.  Any other name is
    returned stripped and upper-cased.

    Values without string methods (None, NaN from a missing field) are
    returned unchanged instead of raising, so the function can be mapped
    over a column that contains missing entries.
    """
    try:
        r = r.strip().upper()
    except AttributeError:
        # Not a string: nothing to normalize.
        return r
    if r == 'FVG' or r.startswith('FRIULI'):
        return 'FRIULI-VENEZIA GIULIA'
    if r.startswith('EMILIA'):
        return 'EMILIA-ROMAGNA'
    if r.startswith('TRENTINO'):
        return 'TRENTINO-ALTO ADIGE'
    if r in ('ALBANIA', 'BAVIERA'):
        return 'ESTERO'
    return r
# One row per selected teacher: normalized region, school type and the number
# of teams the teacher expects to enrol.  The explicit column list fixes the
# column order.
stat = pd.DataFrame(
    {
        'regione': filledt['school_region'].map(norm_region),
        'tipo': filledt['school_type'],
        'squadre attese': filledt['teams_expected'],
    },
    columns=['regione', 'tipo', 'squadre attese'],
)
# Expected teams per (region, school type), paired with the matching ISTAT
# population.
expected = stat.groupby(['regione', 'tipo']).aggregate('sum')
for (reg, tipo), row in expected.iterrows():
    if len(tipo) > 1:
        # A multi-letter type is treated as one letter per school level: the
        # teams are shared out equally among those levels.  float() forces
        # true division; with integer counts Python 2 would truncate the
        # share and silently lose teams.
        share = float(row['squadre attese']) / len(tipo)
        for t in tipo:
            try:
                expected.loc[(reg, t), 'squadre attese'] += share
            except KeyError:
                # No row yet for this (region, level): create it.
                expected.loc[(reg, t), 'squadre attese'] = share
            try:
                expected.loc[(reg, t), 'popolazione'] = istat.loc[reg, t]
            except KeyError:
                # Region or level unknown to the ISTAT table.
                print(":{}:{}:NOT FOUND".format(reg, t))
                expected.loc[(reg, t), 'popolazione'] = pd.np.NaN
    else:
        try:
            expected.loc[(reg, tipo), 'popolazione'] = istat.loc[reg, tipo]
        except KeyError:
            # Region or level unknown to the ISTAT table.
            print("_{}_{}_NOT FOUND".format(reg, tipo))
            expected.loc[(reg, tipo), 'popolazione'] = pd.np.NaN
# Keep only the single-level rows; the multi-letter rows have already been
# shared out among them above.
expected = expected[expected.index.isin(['E','M','S'], level=1)].sort_index()
# Pupils are estimated as 4 per team; coverage is pupils per thousand students.
expected['alunni attesi'] = expected['squadre attese'] * 4
expected['copertura (alunni ogni mille)'] = 1000 * expected['alunni attesi'] / expected['popolazione']
display(expected)
# Columns that can be added up across rows (coverage is a ratio, so it is
# recomputed from the summed figures instead).
_summable = ['squadre attese', 'alunni attesi', 'popolazione']

# Totals per school level.
tot = expected[_summable].groupby(level='tipo').sum()
tot['copertura (alunni ogni mille)'] = 1000 * tot['alunni attesi'] / tot['popolazione']
display(tot)

# Grand totals over all school levels.
glob = tot.sum()
_summary = """squadre attese: {}\t alunni attesi: {}
popolazione: {}\t copertura (alunni ogni mille) {:0.1f}"""
print(_summary.format(int(glob["squadre attese"]),
                      int(glob["alunni attesi"]),
                      int(glob["popolazione"]),
                      1000 * glob["alunni attesi"] / glob["popolazione"]))

# Totals per region.
exp_reg = expected[_summable].groupby(level='regione').sum()
exp_reg['copertura (alunni ogni mille)'] = 1000 * exp_reg['alunni attesi'] / exp_reg['popolazione']
display(exp_reg)
Cartografia ISTAT 2011 (fonte: http://www.istat.it/it/archivio/24613), convertita in GeoJSON con il comando:
`ogr2ogr -f GeoJSON -s_srs reg2011_g.prj -t_srs EPSG:4326 it.json reg2011_g.shp`
import geopandas as gpd
# Load the geometries from it.json into a GeoDataFrame; the code below reads
# and rewrites its NOME_REG column.
it = gpd.read_file("it.json")
# Suffixes for the per-map columns: the overall total first, then one label
# for each of the school levels E, M and S (in that order).
TYPES = ('totale', 'primaria', 'secondaria primo grado', 'secondaria secondo grado')
def get_data_with_default(geo, i, t, data, j, label='squadre attese'):
    """Copy one value from ``data`` into ``geo``, defaulting to 0 when absent.

    The value ``data.loc[j, label]`` is stored in ``geo`` at row ``i``,
    column ``label + ' ' + t`` (the column is created if needed).  When
    ``data`` has no entry for ``j`` / ``label`` the cell is set to 0.

    Args:
        geo: frame to update in place.
        i: row label in ``geo``.
        t: suffix appended to ``label`` to name the target column.
        data: frame to read from.
        j: row label in ``data`` (a tuple for a multi-level index).
        label: source column in ``data`` and prefix of the target column.

    Returns:
        The value now stored in the target cell of ``geo``.

    Only a missing key triggers the default; any other error is propagated
    (it is no longer swallowed by a ``return`` inside ``finally``).
    """
    column = label + ' ' + t
    try:
        value = data.loc[j, label]
    except KeyError:
        value = 0
    geo.loc[i, column] = value
    return geo.loc[i, column]
# Attach expected teams and student population to every region of the map.
# A map row and an ISTAT region are considered the same region when the first
# five characters of their names coincide; only the first match is used.
for i, r in it.iterrows():
    _prefix = r['NOME_REG'][0:5]
    for cname in istat.index:
        if cname[0:5] != _prefix:
            continue
        # Adopt the ISTAT spelling of the region name.
        it.loc[i, 'NOME_REG'] = cname
        # Expected teams: regional total first, then one column per level.
        get_data_with_default(it, i, TYPES[0], exp_reg, cname)
        for _label, _code in zip(TYPES[1:], ('E', 'M', 'S')):
            get_data_with_default(it, i, _label, expected, (cname, _code))
        # Student population, in the same order.
        for _label, _code in zip(TYPES, ('totale', 'E', 'M', 'S')):
            it.loc[i, 'popolazione ' + _label] = istat.loc[cname, _code]
        break

# Derived columns: pupils (4 per team) and pupils per thousand students.
for t in TYPES:
    _pupils = 'alunni attesi ' + t
    it[_pupils] = it['squadre attese ' + t] * 4
    it['copertura ' + t] = 1000 * it[_pupils] / it['popolazione ' + t]
# 2x2 grid of maps, one per entry of TYPES, coloured by the expected pupils
# per thousand students.
_coverage_style = dict(cmap='YlOrRd', scheme='quantiles', legend=True)
plt.figure(figsize=(15, 15))
for i, t in enumerate(TYPES):
    ax = plt.subplot(2, 2, i + 1)
    # Frameless panel with undistorted geometry.
    ax.set_aspect("equal")
    ax.set_axis_off()
    ax.set_title("Alunni attesi ogni mille ({})".format(t))
    it.plot(ax=ax, column='copertura ' + t, **_coverage_style)
# World geometries, indexed by the "name" column.
w = gpd.read_file("world.json").set_index("name")
# Figures from wbebras.json, one row per top-level key (orient='index'); the
# code below reads its "bebras" and "oecd" columns.
with open("wbebras.json", "r") as t:
    wbebras = pd.read_json(t, convert_axes=True, orient='index')
for i in wbebras.index:
    try:
        w.loc[i, "bebras"] = wbebras.loc[i, "bebras"]
        w.loc[i, "oecd"] = wbebras.loc[i, "oecd"]
        # "bebras" per thousand "oecd".
        w.loc[i, "copertura"] = 1000 * wbebras.loc[i, "bebras"] / wbebras.loc[i, "oecd"]
    except Exception:
        # Best effort: report the entry that could not be processed and carry
        # on.  Catching Exception (instead of a bare except) lets
        # KeyboardInterrupt and SystemExit through.
        print(i)
def _map_axes(position, title):
    """Create subplot *position* as a frameless, equal-aspect titled panel."""
    axes = plt.subplot(position)
    axes.set_aspect("equal")
    axes.set_axis_off()
    axes.set_title(title)
    return axes


plt.figure(figsize=(10, 10))
# Lower panel: coverage, restricted to rows with no missing value in any column.
ax = _map_axes(212, "Partecipanti 2015 ogni 1000 studenti (dati OECD 2012)")
w.dropna().plot(ax=ax, column='copertura', cmap='Blues', scheme='quantiles', legend=True)
# Upper panel: absolute figures, for every row that has a "bebras" value.
ax = _map_axes(211, "Partecipanti Bebras 2015")
p = w.dropna(subset=["bebras"]).plot(ax=ax, column='bebras', cmap='YlOrRd', scheme='quantiles', legend=True)
# The "bebras" figures, largest first.
display(wbebras.sort_values("bebras", ascending=False)["bebras"])
import json
# Load the cached overview and build one row per entry of its 'teams' list.
with open("overview.json", "r") as t:
    overview = json.load(t)
dfov = pd.DataFrame(overview['teams'])
_exams = dfov['view_exam_list']

# Regular expressions are written as raw strings so that sequences such as
# \d reach the regex engine untouched.
gare = pd.DataFrame()
# Part of 'u_class' that precedes the last underscore.
gare['categoria'] = dfov['u_class'].str.extract(r'(.+)_.+', expand=False)
gare['insegnante'] = dfov['t_id'].astype('int64')
gare['login'] = dfov['u_id']
gare['status'] = dfov['u_investigation']
# 'view_exam_list' contains text of the form "<points>p/<minutes>min" and
# "Server End Date: <date>"; the extracted points and minutes stay strings.
gare['risultato'] = _exams.str.extract(r'(\d+)p/\d+min', expand=False)
gare['tempo'] = _exams.str.extract(r'\d+p/(\d+)min', expand=False)
gare['data'] = pd.to_datetime(_exams.str.extract(r'Server End Date: ([0-9/ :]+)', expand=False))
# NOTE(review): disabled experiments with other timestamp fields, kept for
# reference.
#gare['prova'] = pd.to_datetime(dfov['11'].astype('int64'), unit='s')
#gare['prova1'] = pd.to_datetime(dfov['15'].fillna(0).astype('int64'), unit='s')
#gare['prova2'] = pd.to_datetime(dfov['e_last_creation_date'].fillna(0).astype('int64'), unit='s')
#gare['prova3'] = pd.to_datetime(dfov['u_lastlogindate'].astype('int64'), unit='s')
# Selected teachers keyed by account id, with the normalized region attached.
fid = filledt.set_index('id')
fid['regione'] = fid['school_region'].map(norm_region)

# Give every row of gare the region of its teacher, then keep the rows whose
# status is not 'Empty'.
gare = gare.join(fid[['regione']], on='insegnante')
_not_empty = gare['status'] != 'Empty'
done = gare[_not_empty]

# Number of distinct teachers per region, then per category.
for _group_key in ('regione', 'categoria'):
    display(done.groupby([_group_key])['insegnante'].nunique())

# Non-null counts of every column for each (region, category) pair.
dcount = done.groupby(['regione', 'categoria']).count()
# Listed by hand to force the order
TYPES = ('kilo', 'mega', 'giga', 'tera', 'peta') # done['categoria'].unique()

# For every region of the map store one 'login <category>' column per entry
# of TYPES (0 when the pair is absent from dcount) plus their sum.  Map rows
# and ISTAT regions are matched on the first five characters of the name;
# only the first match is used.
for i, r in it.iterrows():
    _prefix = r['NOME_REG'][0:5]
    for cname in istat.index:
        if cname[0:5] != _prefix:
            continue
        totale = sum(
            get_data_with_default(it, i, t, dcount, (cname, t), label='login')
            for t in TYPES
        )
        it.loc[i, "Squadre totali"] = totale
        break
# 2x3 grid of maps: one per entry of TYPES, with the overall total in the
# sixth cell.
_quantile_legend = dict(scheme='quantiles', legend=True)
plt.figure(figsize=(16, 16))
for i, t in enumerate(TYPES):
    ax = plt.subplot(2, 3, i + 1)
    # Frameless panel with undistorted geometry.
    ax.set_aspect("equal")
    ax.set_axis_off()
    ax.set_title("Squadre ({})".format(t))
    it.plot(ax=ax, column='login ' + t, cmap='YlOrRd', **_quantile_legend)

ax = plt.subplot(2, 3, 6)
ax.set_aspect("equal")
ax.set_axis_off()
ax.set_title("Squadre ({})".format('totale'))
p = it.plot(ax=ax, column='Squadre totali', cmap='Blues', **_quantile_legend)