Statistiche gare Bebras italiano 2021¶

from IPython.display import HTML

# Render a button that shows/hides the code cells of the notebook page.
# The embedded script toggles the visibility of every `div.input` element;
# `code_toggle` is also run once when the document is ready, and since
# `code_show` starts as true, that first run hides the code.
# NOTE(review): the script relies on `$` being provided by the hosting page
# (presumably jQuery from the classic notebook UI) — confirm for other front ends.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<input type="button" value="Clicca per vedere/nascondere il codice Python" onclick="code_toggle()">''')

import warnings
#warnings.filterwarnings('once')
warnings.filterwarnings('ignore')

Distribuzione dei punteggi¶

import json, hashlib, urllib, os.path
import urllib.request

import numpy as np
import pandas as pd

# Never truncate the rows or the columns of a displayed DataFrame.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Contest categories and their two variants.
CATS = ('kilo', 'mega', 'giga', 'tera', 'peta')
SUBS = ('single', 'double')

# Category-major order: both variants of a category are adjacent.
CATEGORIES = tuple('-'.join((cat, sub)) for cat in CATS for sub in SUBS)
# Variant-major order: all the `single` names first, then all the `double` ones.
CAT_FILES = tuple('-'.join((cat, sub)) for sub in SUBS for cat in CATS)

# The API key is the first line of a local file, so it never appears in the notebook.
with open('secret.key') as keyfile:
    key = keyfile.readline().strip()

# The `test` parameter of the API is this value plus the position of the
# category in CAT_FILES.
FIRST_TEST_ID = 98

# Download the results of each category once; an existing file acts as a cache.
for i, k in enumerate(CAT_FILES):
    results_path = f"results-{k}.json"
    if not os.path.exists(results_path):
        url = f"https://bebras.it/api?key={key}&view=exams&edition=bebras_2021&events=0&test={FIRST_TEST_ID+i}"
        # Read the whole body (and close the connection) before creating the
        # file: a failed download must not leave behind an empty file that
        # later runs would mistake for cached results.
        with urllib.request.urlopen(url) as r:
            payload = r.read().decode('utf-8')
        with open(results_path, "w") as tw:
            tw.write(payload)

# Collect the exam records of every category, in CATEGORIES order,
# then build a single DataFrame with one row per exam.
score = []
for k in CATEGORIES:
    with open(f"results-{k}.json", "r") as results_file:
        score.extend(json.load(results_file)['exams'])

scoredf = pd.DataFrame(score)

# The timestamps must be corrected for the time zone.
# NOTE(review): a fixed one-hour shift is applied — presumably UTC to CET; confirm
# that no exam falls in a period with a different offset.
TZ_SHIFT = 60*60   # seconds added to every timestamp
SLOT = 45*60       # seconds: 45-minute "hours", the duration of a contest

start_seconds = scoredf['exam_date'].astype('int64') + TZ_SHIFT
scoredf['server_start'] = pd.to_datetime(start_seconds, unit='s')
# Index of the 45-minute slot in which the exam started.
# (`pd.np` no longer exists in current pandas, so NumPy is called directly.)
scoredf['orainizio'] = np.floor(start_seconds / SLOT)
scoredf['punteggio'] = pd.to_numeric(scoredf['score'])
# Negative totals count as zero in the normalized score.
scoredf['punteggio_norm'] = scoredf['punteggio'].map(lambda x: x if x >= 0 else 0)
# Team ids are replaced by their MD5 digest in the exported data.
# NOTE(review): the digest is unsalted; if team ids are small integers they can be
# recovered by enumeration — confirm this level of anonymization is acceptable.
scoredf['anonid'] = scoredf['team_id'].map(lambda x: hashlib.md5(str(x).encode('utf8')).hexdigest())
# Ordered categorical, so that sorting follows the CATEGORIES order.
scoredf['categoria'] = scoredf['category'].str.lower().astype(pd.api.types.CategoricalDtype(categories = CATEGORIES, ordered=True))

# Only the exams flagged as validly scored are analysed and exported.
valid = scoredf[scoredf['exam_valid_score'] == 1]
valid.to_csv('anonris.csv', columns=['anonid', 'categoria', 'orainizio', 'punteggio', 'punteggio_norm', 'time'])

from IPython.display import display, Markdown

# Build an HTML table with the descriptive statistics of the normalized
# scores, one row per category, followed by the overall number of teams.
txt = '''<table>
<caption>Squadre partecipanti al Bebras 2021/22 con risultati correttamente registrati</caption>
<thead>
  <tr><th>Categoria</th>
  <th>squadre</th>
  <th> min </th>
  <th> max </th>
  <th> media </th>
  <th> std.dev. </th>
  <th>I quartile </th>
  <th>mediana </th>
  <th>III quartile</th>
  <th>Squadre al minimo</th>
  <th>Squadre al massimo</th>
</tr>
</thead>
<tbody>
'''
SUMMARY_ROW = ("<tr><th>{}</th><td>{}</td><td>{}</td><td>{}</td><td>{:.1f}</td>"
               "<td>{:3.1f}</td><td>{}</td><td>{}</td><td>{}</td><td>{:.1f}%</td><td>{:.1f}%</td></tr>")

# Rows without a recognised category are skipped: they have no statistics to show.
for k in valid['categoria'].dropna().unique().sort_values():
    # Select the scores of the category once and reuse them for every figure.
    scores = valid.loc[valid['categoria'] == k, 'punteggio_norm']
    s = scores.describe()
    # Teams sitting exactly at the extremes; compared with the actual
    # extremes rather than with their integer truncation.
    at_min = (scores == s['min']).sum()
    at_max = (scores == s['max']).sum()
    txt += SUMMARY_ROW.format(k,
                              int(s['count']),
                              int(s['min']),
                              int(s['max']),
                              float(s['mean']),
                              float(s['std']),
                              int(s['25%']),
                              int(s['50%']),
                              int(s['75%']),
                              100*at_min/float(s['count']),
                              100*at_max/float(s['count']))
# Every opened section of the table is now closed explicitly.
txt += '</tbody><tfoot><tr><th>Totale</th><td>{}</td></tr></tfoot>'.format(valid['punteggio_norm'].count())
txt += '</table>'
display(Markdown(txt))

%matplotlib inline
import matplotlib.pyplot as plt
# Use the 'ggplot' style for the figures drawn from here on.
plt.style.use('ggplot')

# One histogram of the normalized scores per category (24 bins each),
# arranged on a grid of 5 rows by 2 columns.
histograms = valid['punteggio_norm'].hist(by=valid['categoria'], bins=24, figsize=(10,16), layout=(5, 2))

Percentili per punteggio¶

# For every category, show which percentage of teams scored strictly less
# than each score from 1 up to the best score of the category.
for k in valid['categoria'].unique().sort_values():
    # Select the scores of the category once instead of re-filtering the
    # whole frame for every threshold.
    scores = valid.loc[valid['categoria'] == k, 'punteggio']
    tot = float(scores.count())
    top = int(scores.max())
    thresholds = range(1, top+1)
    pp = [100 * (scores < i).sum()/tot for i in thresholds]
    # Header and body cells are wrapped in a row, and every section is closed.
    txt = '''<table>
    <caption>Percentili per la categoria {} (che percentuale di squadre si supera con un dato punteggio)</caption>
    <thead><tr>'''.format(k)
    txt += ''.join(['<td>{}</td>'.format(i) for i in thresholds])
    txt += '</tr></thead><tbody><tr>'
    txt += ''.join(['<td>{:.1f}</td>'.format(f) for f in pp])
    txt += '</tr></tbody></table>'
    display(Markdown(txt))

Analisi delle risposte¶

# Fields copied from every question record of an exam.
QUESTION_FIELDS = ('q_id','q_class','q_score','q_scoreMax','q_time')

# Flatten the exams into one record per answered question, tagged with the
# anonymous id of the team. Records that lack one of the fields (or that are
# not mappings at all) are skipped and counted.
rr = []
errors = 0
for r in valid.itertuples():
    for q in r.exam_data['questions']:
        try:
            t = {field: q[field] for field in QUESTION_FIELDS}
        except (KeyError, TypeError):
            # KeyError: a field is missing; TypeError: the record is not a
            # mapping. Any other exception is unexpected and is not hidden.
            errors += 1
            continue
        t['anonid'] = r.anonid
        rr.append(t)
# Number of question records that were skipped.
print(errors)

6043

# One row per collected question record.
quiz = pd.DataFrame(rr)

# For every distinct question id, map its last '_'-separated token to its
# second token.
MAPBEBRAS = {
    tokens[-1]: tokens[1]
    for tokens in (qid.split('_') for qid in quiz['q_id'].unique())
}

# Italian titles of the questions, keyed by question code ('Q01' ... 'Q28').
MAPNAMES = {
    'Q01': 'Lettere',
    'Q02': 'Vestito da ballo',
    'Q03': 'Assistente virtuale',
    'Q04': 'Timbri',
    'Q05': 'Mappa concettuale',
    'Q06': 'Sequenze di DNA',
    'Q07': 'Collane',
    'Q08': 'Display difettoso',
    'Q09': 'Fotografie del gatto',
    'Q10': 'Case nel villaggio',
    'Q11': 'Auto con guida autonoma',
    'Q12': 'Un gioco coi birilli',
    'Q13': "Ricostruire il DNA",
    'Q14': "Cuculi e nidi",
    'Q15': 'Topo-robot',
    'Q16': 'Canguri',
    'Q17': 'I gruppi di lavoro',
    'Q18': 'La pila di frutta',
    'Q19': 'Logistica museale',
    'Q20': 'Cenni del capo',
    'Q21': 'Disegni programmati',
    'Q22': 'Un codice compresso',
    'Q23': 'Rilevamento di guasti',
    'Q24': 'Andiamo in biblioteca',
    'Q25': "Di corsa all'incontro",
    'Q26': 'Salviamo gli alberi',
    'Q27': 'Piastrelle Truchet',
    'Q28': "Ada l'ingegnera",
}

# Longest answer time that is kept, in seconds (the 45 minutes of a contest).
MAX_SECONDS = 45*60


def _minutes(seconds):
    """Convert a time in seconds to minutes.

    Returns NaN when the value is negative or longer than MAX_SECONDS.
    """
    value = float(seconds)
    return value/60. if 0 <= value <= MAX_SECONDS else np.nan


quiz = quiz.rename(columns={'q_time': 'time', 'q_score': 'score', 'q_scoreMax': 'score_max', 'q_class': 'cat'})

# A question id is '<digits>_<name>': split it into edition and name.
quiz['nome'] = quiz['q_id'].str.extract('[0-9]+_(.+)', expand=False)
quiz['edizione'] = quiz['q_id'].str.extract('([0-9]+)_.+', expand=False)
# Outcome flags of every answer.
quiz['completo'] = quiz['score'] == quiz['score_max']
quiz['parziale'] = (quiz['score'] > 0) & (quiz['score'] != quiz['score_max'])
quiz['penalizzato'] = quiz['score'] < 0
# Fraction of the maximum score that was obtained (negative when penalized).
quiz['voto'] = quiz['score'] / quiz['score_max'].astype('float64')
# (`pd.np` no longer exists in current pandas, so NumPy is used directly.)
quiz['minuti'] = quiz['time'].map(_minutes)

# Export currently disabled; note that it lists a 'bebras' column that this cell does not create.
#quiz.to_csv('quiz.csv', columns=['anonid', 'cat', 'edizione', 'nome', 'bebras', 'score', 'score_max', 'time'])

# Attach to every answer the data of the team that gave it.
vquiz = pd.merge(valid[['anonid', 'categoria', 'punteggio','punteggio_norm','orainizio','teacher_id','school_cap']], quiz, on='anonid')

plt.figure(figsize=(16,40))

def bname(n):
    """Return the title of question `n` from MAPNAMES.

    `n` itself is returned when it is not a key of both MAPBEBRAS and MAPNAMES.
    """
    if n in MAPBEBRAS and n in MAPNAMES:
        return '{}'.format(MAPNAMES[n])
    else:
        return n

# One panel per category: share of complete answers, with the share of
# partial answers stacked on top of it.
panel_categories = valid['categoria'].unique().sort_values()

for j, k in enumerate(panel_categories):
    plt.subplot(len(panel_categories), 1, j+1)
    # Per-question means, with the questions in order of first appearance.
    m = vquiz[vquiz['categoria'] == k].groupby('nome',
                                             sort=False)[['completo','voto', 'parziale', 'penalizzato', 'minuti','score_max']].mean()

    # (`pd.np` no longer exists in current pandas, so NumPy is used directly.)
    positions = np.arange(m.index.size)
    c = plt.bar(positions, m['completo'], color='blue')
    p = plt.bar(positions, m['parziale'], bottom=m['completo'], color='lightblue')
    # A list, not a lazy `map` object: tick labels must be a sequence.
    plt.xticks(positions, [bname(n) for n in m.index.tolist()], rotation=45)
    plt.ylim([0,1])
    ticks = np.arange(0,1.2,.2)
    plt.yticks(ticks, ['{:.0f}%'.format(100*y) for y in ticks])
    for i in range(m.index.size):
        # Average minutes spent, written inside the bar of complete answers.
        plt.annotate(text="{:.0f}'".format(m['minuti'].iloc[i]), xy=(i, .75*m['completo'].iloc[i]), color='white')
        # Maximum score of the question, written at the foot of the bar.
        plt.annotate(text='{:.0f}'.format(m['score_max'].iloc[i]), xy=(i-.15, .02), color='yellow', fontsize='x-large')
    plt.legend((c[0],p[0]), ('completo','parziale'), loc=(.92,.6))
    plt.title(('{}: tassi di soluzione (il numero in alto indica i minuti spesi in media sul quesito, '
               'il numero in basso il punteggio massimo ottenibile)').format(k))

plt.tight_layout()
plt.savefig('tassisol.png')

plt.figure(figsize=(16,40))

# One panel per category: average share of the score awarded (upwards, green)
# and share of penalized answers (downwards, red).
# The categories are sorted so that the panels follow the category order.
panel_categories = valid['categoria'].unique().sort_values()

for j, k in enumerate(panel_categories):
    plt.subplot(len(panel_categories), 1, j+1)
    # Per-question means, with the questions in order of first appearance.
    m = vquiz[vquiz['categoria'] == k].groupby('nome',
                                             sort=False)[['completo','voto', 'parziale', 'penalizzato', 'minuti','score_max']].mean()

    # (`pd.np` no longer exists in current pandas, so NumPy is used directly.)
    positions = np.arange(m.index.size)
    c = plt.bar(positions, m['voto'], color='green')
    z = plt.bar(positions, -m['penalizzato'], color='red')
    plt.ylim([-1,1])
    # The labels show absolute values: the lower half is a percentage too.
    ticks = np.arange(-1,1.2,.2)
    plt.yticks(ticks, ['{:.0f}%'.format(100*abs(y)) for y in ticks])

    # A list, not a lazy `map` object: tick labels must be a sequence.
    plt.xticks(positions, [bname(n) for n in m.index.tolist()], rotation=45)
    for i in range(m.index.size):
        # Maximum score of the question, written near the bottom of the panel.
        plt.annotate(text='{:.0f}'.format(m['score_max'].iloc[i]), xy=(i, -.8), color='blue')

    plt.legend((c[0],z[0]), ('punteggio','penalità'), loc=(0.91,.725))
    plt.title('{}: percentuale di punteggio attribuito in media, in rosso la percentuale di penalizzati (il numero in basso è il punteggio massimo)'.format(k))

plt.tight_layout()
plt.savefig('punti.png')

Analisi delle squadre¶

# One record per team member of every valid exam, tagged with the category
# and the id of the team.
members = []
for r in valid.itertuples():
    composition = r.team_composition
    # Only a mapping can carry the member list; anything else (a missing
    # value, an empty container) is skipped.
    if isinstance(composition, dict) and 'members' in composition:
        for m in composition['members']:
            # Build a new record instead of adding keys to the dictionaries
            # stored inside `valid`.
            members.append(dict(m, categoria=r.category.lower(), team_id=r.team_id))

pupils = pd.DataFrame(members)
# A '-' in the `sex` field is treated as missing.
# (`pd.np` no longer exists in current pandas, so NumPy is used directly.)
pupils['genere'] = pupils['sex'].map(lambda x: x if x != '-' else np.nan)
pupils['categoria'] = pupils['categoria'].astype(pd.api.types.CategoricalDtype(categories = CATEGORIES, ordered=True))

def _share(part, whole):
    """Return `part` as a percentage of `whole`, or NaN when `whole` is zero."""
    return 100*float(part)/float(whole) if whole else float('nan')


def _ratio(num, den):
    """Return num/den as a float, or NaN when `den` is zero."""
    return float(num)/float(den) if den else float('nan')


# Members whose gender is known.
known = pupils[pupils['genere'].notnull()]

# `observed=False` is explicit so that every category is present in the
# groupings regardless of the default of the installed pandas version.
gender = known.groupby(['categoria', 'genere'], observed=False).count()
# Number of members per (category, gender). The rows themselves are counted:
# taking the non-null count of the `class` column would leave out every
# member whose class is missing, and yield fewer students than teams.
members_by_gender = known.groupby(['categoria', 'genere'], observed=False).size()

txt = '''<table><caption>Studenti partecipanti al Bebras 2021 con risultati validi 
(i dati dipendono dalla corretta compilazione dei profili delle squadre)</caption>
<thead>
  <tr><th>Categoria</th>
  <th>studenti</th>
  <th>femmine</th>
  <th>maschi</th>
  <th>squadre con dati mancanti</th>
  <th>media componenti per squadra</th>
  </tr>
</thead>
<tbody>
'''
# Teams with at least one member of known gender / of unknown gender.
notempty = known.groupby('categoria', observed=False)['team_id'].nunique()
empty = pupils[pupils['genere'].isnull()].groupby('categoria', observed=False)['team_id'].nunique()

totf = 0
totm = 0
tot = 0
for k in pupils['categoria'].dropna().unique().sort_values():
    f = int(members_by_gender.get((k, 'f'), 0))
    totf += f
    m = int(members_by_gender.get((k, 'm'), 0))
    totm += m
    missing = int(empty.get(k, 0))
    # Every team with missing data is assumed to have the full size of its category.
    # NOTE(review): a team with one known and one unknown member is in both
    # `notempty` and `empty`, so this estimate may count some members twice — confirm.
    tot += f + m + missing*(2 if '-double' in k else 1)

    txt += f'<tr><th>{k}</th><td>{f+m}</td><td>{f} ({_share(f, f+m):.1f}%)</td><td>{m} ({_share(m, f+m):.1f}%)</td><td>{missing}</td><td>{_ratio(f+m, notempty.get(k, 0)):.2f}</td></tr>'

txt += f'<tr><th>Totale:</th><td>{totf+totm}</td><td>{totf} ({_share(totf, totf+totm):.1f}%)</td><td>{totm} ({_share(totm, totf+totm):.1f}%)</td></tr>'
txt += f'<tr><th>Totale comprese squadre con dati mancanti</th><td>{tot}</td></tr>'
txt += '</tbody></table>'
display(Markdown(txt))

Categoria	squadre	min	max	media	std.dev.	I quartile	mediana	III quartile	Squadre al minimo	Squadre al massimo
kilo-single	1638	0	48	15.2	10.3	7	14	22	6.3%	0.4%
kilo-double	1828	0	48	18.9	9.9	12	18	25	2.2%	0.5%
mega-single	7294	0	48	12.5	9.6	5	11	18	10.1%	0.2%
mega-double	4281	0	48	14.7	10.0	7	13	21	6.5%	0.2%
giga-single	3302	0	46	9.9	8.2	3	8	15	12.1%	0.0%
giga-double	2042	0	48	11.4	8.7	5	10	17	11.1%	0.1%
tera-single	3037	0	48	14.9	9.0	8	14	21	4.0%	0.0%
tera-double	1603	0	45	14.9	9.0	8	14	21	4.8%	0.1%
peta-single	1617	0	50	20.3	10.6	12	19	28	1.9%	0.1%
peta-double	1411	0	50	21.0	10.3	13	21	28	1.3%	0.1%
Totale	28053

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48
6.3	8.5	10.6	13.2	16.5	19.2	22.7	25.6	29.9	32.4	36.0	39.9	43.5	47.8	50.6	54.6	57.9	62.0	65.2	68.9	71.6	74.1	77.4	80.1	82.7	83.8	85.5	86.3	88.2	89.5	90.7	92.2	93.0	93.8	94.9	96.4	97.0	97.5	97.9	98.3	98.4	99.0	99.2	99.3	99.5	99.5	99.6	99.6

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48
2.2	3.2	4.3	5.2	6.7	7.9	10.8	12.3	14.8	17.3	19.9	23.6	27.4	31.2	33.8	38.5	42.7	47.5	51.3	55.9	59.2	63.5	67.0	69.9	73.9	75.6	78.7	80.7	82.9	85.3	87.2	88.8	90.0	91.1	92.5	93.9	94.6	95.4	96.0	96.6	97.1	98.6	99.0	99.1	99.3	99.4	99.5	99.5

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48
10.1	13.1	16.7	19.8	23.0	26.9	31.1	35.6	40.3	44.6	48.6	52.3	56.0	59.8	63.8	66.9	69.9	73.2	76.0	78.3	80.7	82.8	84.7	86.2	87.9	89.5	90.5	91.7	92.8	93.8	94.7	95.4	96.0	96.6	97.1	97.6	98.1	98.5	98.7	98.8	98.9	99.1	99.6	99.7	99.8	99.8	99.8	99.8

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48
6.5	8.5	10.9	13.4	16.6	19.7	23.1	26.9	31.1	35.3	38.8	42.8	46.5	50.0	53.6	57.7	61.0	64.2	66.9	70.0	73.4	76.0	78.5	80.5	82.5	84.6	86.3	88.0	89.3	90.6	92.1	93.4	94.3	95.1	95.7	96.7	97.3	98.0	98.3	98.4	98.7	98.8	99.6	99.7	99.8	99.8	99.8	99.8

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46
12.1	16.7	21.8	26.3	30.3	36.1	41.0	46.2	51.6	55.8	60.3	64.4	68.0	71.4	74.4	77.2	79.9	82.3	84.2	86.1	87.9	89.6	90.8	92.0	93.2	94.6	95.7	96.6	97.3	97.8	98.5	98.5	99.0	99.2	99.3	99.5	99.7	99.7	99.8	99.8	99.8	99.9	99.9	99.9	99.9	100.0

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48
11.1	14.3	18.2	21.4	24.4	29.2	33.6	38.1	42.0	46.3	51.3	56.2	60.2	64.0	68.0	70.8	74.1	77.2	79.5	82.0	84.4	86.3	88.0	90.0	91.2	92.2	93.6	94.8	95.4	96.3	97.1	97.6	98.6	98.7	98.9	99.3	99.5	99.6	99.8	99.8	99.8	99.8	99.9	99.9	99.9	99.9	99.9	99.9

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48
4.0	5.2	6.5	8.5	12.6	14.1	19.3	23.2	26.9	30.6	34.7	39.2	44.4	48.2	52.2	56.4	60.3	64.1	67.6	70.7	73.9	77.0	79.5	82.2	84.6	86.6	88.7	90.5	91.7	92.9	94.2	95.2	96.2	96.8	97.8	98.2	98.6	98.8	99.2	99.4	99.7	99.8	99.9	99.9	99.9	99.9	100.0	100.0

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45
4.8	6.2	7.8	9.5	13.3	15.0	19.7	22.1	25.5	29.1	33.4	37.9	43.3	47.3	51.2	56.1	60.9	64.8	68.1	70.9	74.1	76.7	79.7	82.3	84.9	87.0	88.9	90.6	92.1	93.4	94.6	95.1	95.9	97.0	97.9	98.4	98.7	98.8	99.1	99.3	99.5	99.5	99.8	99.9	99.9

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48	49	50
1.9	2.3	3.5	4.1	5.3	7.2	8.5	10.9	13.9	16.8	18.9	22.8	26.1	28.3	31.5	36.2	38.9	42.1	46.3	50.5	54.1	58.0	60.4	63.2	66.8	69.9	72.4	75.0	78.6	80.4	82.7	84.5	85.7	87.3	88.9	90.7	92.0	93.2	94.2	94.7	96.0	96.7	97.6	98.4	98.6	98.7	99.0	99.3	99.8	99.9

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48	49	50
1.3	1.7	2.3	2.7	3.9	5.2	6.9	9.0	10.8	13.6	16.9	19.7	23.2	26.3	29.9	33.5	36.5	41.4	44.4	46.8	49.9	54.1	57.6	61.2	64.6	68.1	70.5	73.4	76.3	78.7	81.5	83.3	85.1	86.3	88.7	90.4	91.8	92.8	94.0	94.7	95.8	96.7	97.7	98.4	99.0	99.2	99.6	99.6	99.8	99.9

Categoria	studenti	femmine	maschi	squadre con dati mancanti	media componenti per squadra
kilo-single	1370	674 (49.2%)	696 (50.8%)	1481	0.92
kilo-double	3118	1571 (50.4%)	1547 (49.6%)	150	1.83
mega-single	5227	2511 (48.0%)	2716 (52.0%)	6160	0.88
mega-double	6147	2902 (47.2%)	3245 (52.8%)	712	1.67
giga-single	2453	1184 (48.3%)	1269 (51.7%)	2724	0.90
giga-double	3022	1469 (48.6%)	1553 (51.4%)	331	1.68
tera-single	2182	615 (28.2%)	1567 (71.8%)	2871	0.90
tera-double	2523	761 (30.2%)	1762 (69.8%)	264	1.68
peta-single	1283	292 (22.8%)	991 (77.2%)	1330	0.96
peta-double	2316	619 (26.7%)	1697 (73.3%)	198	1.72
Totale:	29641	12598 (42.5%)	17043 (57.5%)
Totale comprese squadre con dati mancanti	47517