Statistiche gare Bebras italiano 2024

from IPython.display import HTML

# Render a button that shows/hides every code input cell of the notebook.
# The embedded script selects cells with `$('div.input')`, so it relies on
# jQuery being available in the page and on input cells being `div.input`
# elements -- presumably the classic Notebook / nbconvert HTML layout;
# TODO confirm it still works in the front end used for publishing.
# `code_toggle` is also registered on document-ready while `code_show` is
# true, so the code starts out hidden.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<input type="button" value="Clicca per vedere/nascondere il codice Python" onclick="code_toggle()">''')

import warnings
#warnings.filterwarnings('once')
warnings.filterwarnings('ignore')

Distribuzione dei punteggi

import hashlib
import json
import os.path
import urllib
import urllib.request

import numpy as np
import pandas as pd

# Never truncate rows or columns when a DataFrame is displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Contest categories, in increasing order.
CATS = ('kilo','mega','giga','tera','peta')
# Category labels as they appear in the data, and the suffixes used for the
# cached result files; both are currently the plain category names.
CATEGORIES = tuple(str(c) for c in CATS)
CAT_FILES = tuple(str(c) for c in CATS)

# The API key is the first line of the local file secret.key.
with open('secret.key') as key_file:
    key = key_file.readline().strip()

# Download each category's results once and cache them as results-<cat>.json.
# The API test ids are consecutive, starting at 121, in the order of CAT_FILES.
for i, k in enumerate(CAT_FILES):
    cache_path = f"results-{k}.json"
    if os.path.exists(cache_path):
        continue
    url = f"https://bebras.it/api?key={key}&view=exams&edition=bebras_2024&events=0&test={121+i}"
    # Fetch and decode the whole payload BEFORE creating the cache file: if the
    # request fails, no empty results file is left behind to be mistaken for a
    # valid cache on the next run. The `with` also closes the HTTP response.
    with urllib.request.urlopen(url) as r:
        payload = r.read().decode('utf-8')
    with open(cache_path, "w") as tw:
        tw.write(payload)

# Collect the exam records of every category from the cached JSON files
# and put them in a single DataFrame (one row per exam).
score = []
for cat in CATEGORIES:
    with open(f"results-{cat}.json", "r") as results_file:
        score.extend(json.load(results_file)['exams'])

scoredf = pd.DataFrame(score)

# The timestamp must be corrected for the time zone: one hour is added to
# the epoch seconds in `exam_date`.
epoch_local = scoredf['exam_date'].astype('int64') + 60*60

scoredf['server_start'] = pd.to_datetime(epoch_local, unit='s')
# Index of the 45-minute slot (the duration of a contest) in which the exam started.
scoredf['orainizio'] = np.floor(epoch_local / (45*60))
scoredf['punteggio'] = pd.to_numeric(scoredf['score'])
# Negative totals (penalties) are reported as 0.
scoredf['punteggio_norm'] = scoredf['punteggio'].map(lambda p: p if p >= 0 else 0)
# NOTE(review): an unsalted MD5 of the team id is a stable pseudonym rather than
# strong anonymisation -- confirm this is acceptable for the exported CSV files.
scoredf['anonid'] = scoredf['team_id'].map(lambda tid: hashlib.md5(str(tid).encode('utf8')).hexdigest())
# Ordered categorical, so that categories sort as kilo < mega < ... < peta.
category_dtype = pd.api.types.CategoricalDtype(categories = CATEGORIES, ordered=True)
scoredf['categoria'] = scoredf['category'].str.lower().astype(category_dtype)

# Keep only the exams flagged as having a valid score and export an
# anonymised extract of them.
has_valid_score = scoredf['exam_valid_score'] == 1
valid = scoredf[has_valid_score]

anon_columns = ['anonid', 'categoria', 'orainizio', 'punteggio', 'punteggio_norm', 'time']
valid.to_csv('anonris.csv', columns=anon_columns)

from IPython.display import display, Markdown

# Build an HTML table with the descriptive statistics of the (clipped) scores,
# one row per category, plus a footer with the overall number of teams.
txt = '''<table>
<caption>Squadre partecipanti al Bebras 2024/25 con risultati correttamente registrati</caption>
<thead>
  <tr><th>Categoria</th>
  <th>squadre</th>
  <th> min </th>
  <th> max </th>
  <th> media </th>
  <th> std.dev. </th>
  <th>I quartile </th>
  <th>mediana </th>
  <th>III quartile</th>
  <th>Squadre al minimo</th>
  <th>Squadre al massimo</th>
</tr>
<tbody>
'''
row_template = ("<tr><th>{}</th><td>{}</td><td>{}</td><td>{}</td><td>{:.1f}</td>"
                "<td>{:3.1f}</td><td>{}</td><td>{}</td><td>{}</td><td>{:.1f}%</td><td>{:.1f}%</td></tr>")
for k in valid['categoria'].unique().sort_values():
    # Select the category's scores once and reuse them for every statistic.
    cat_scores = valid.loc[valid['categoria'] == k, 'punteggio_norm']
    s = cat_scores.describe()
    n_teams = float(s['count'])
    # Number of teams sitting exactly at the lowest / highest score.
    n_bottom = int((cat_scores == int(s['min'])).sum())
    n_top = int((cat_scores == int(s['max'])).sum())
    txt += row_template.format(k,
                               int(s['count']),
                               int(s['min']),
                               int(s['max']),
                               float(s['mean']),
                               float(s['std']),
                               int(s['25%']),
                               int(s['50%']),
                               int(s['75%']),
                               100*n_bottom/n_teams,
                               100*n_top/n_teams)
txt += '<tfoot><tr><th>Totale</th><td>{}</td></tr>'.format(valid['punteggio_norm'].count())
txt += '</table>'
display(Markdown(txt))

%matplotlib inline
import matplotlib.pyplot as plt
# Use the 'ggplot' matplotlib style for the figures drawn from here on.
plt.style.use('ggplot')

# One histogram (24 bins) of the clipped scores per category, arranged on a
# 5x2 grid of subplots.
histograms = valid['punteggio_norm'].hist(by=valid['categoria'], bins=24, figsize=(10,16), layout=(5, 2))

Percentili per punteggio

# For each category, show which percentage of teams scored strictly less than
# each score from 1 up to the category's maximum.
for k in valid['categoria'].unique().sort_values():
    # Select the category's scores once instead of re-filtering `valid`
    # for every threshold.
    cat_scores = valid.loc[valid['categoria'] == k, 'punteggio']
    tot = float(cat_scores.count())
    top = int(cat_scores.max())
    thresholds = range(1, top+1)
    pp = [100 * (cat_scores < i).sum()/tot for i in thresholds]
    # Cells are wrapped in explicit <tr> rows: <td> elements placed directly
    # inside <thead>/<tbody> are not valid table markup.
    txt = '''<table>
    <caption>Percentili per la categoria {} (che percentuale di squadre si supera con un dato punteggio)</caption>
    <thead><tr>'''.format(k)
    txt += ''.join(['<td>{}</td>'.format(i) for i in thresholds])
    txt += '</tr></thead><tbody><tr>'
    txt += ''.join(['<td>{:.1f}</td>'.format(f) for f in pp])
    txt += '</tr></tbody></table>'
    display(Markdown(txt))

Analisi delle risposte

# Flatten the per-question data of every valid exam into a list of records,
# each tagged with the anonymous id of the team.
QUESTION_FIELDS = ('q_id','q_class','q_score','q_scoreMax','q_time')

rr = []
errors = 0
for exam in valid.itertuples():
    for question in exam.exam_data['questions']:
        try:
            record = {field: question[field] for field in QUESTION_FIELDS}
            record['anonid'] = exam.anonid
            rr.append(record)
        except Exception:
            # Any question that cannot be converted (for instance because a
            # field is missing) is skipped and only counted.
            errors += 1
print(errors)

2513

# One row per answered question.
quiz = pd.DataFrame(rr)

# Keep only the second '_'-separated component of the raw question id.
q_id_parts = quiz['q_id'].str.split(pat='_', expand=True)
quiz['q_id'] = q_id_parts[1]

# Titles of the questions, keyed by the question code obtained from `q_id`;
# they are used as tick labels in the charts.
# The codes are not contiguous: Q09, Q14 and Q26 have no entry.
MAPNAMES = {
    'Q01': 'La strada smarrita',
    'Q02': 'Vestiti',
    'Q03': 'Giocattoli in ordine',
    'Q04': 'La macchina del gelato',
    'Q05': 'Barca in porto',
    'Q06': 'Il pirata e il tesoro',
    'Q07': 'Bebramon',
    'Q08': 'Tastiera rotta',
    'Q10': 'Animali in viaggio',
    'Q11': "L'alfabeto dei Tuareg",
    'Q12': 'Palline binarie',
    'Q13': "Buon compleanno!",
    'Q15': 'Bilancia',
    'Q16': 'Videogioco',
    'Q17': 'Muretti a secco',
    'Q18': 'Passeggiata tra gli alberi',
    'Q19': 'Catene di parole',
    'Q20': 'La macchina dei palloncini',
    'Q21': 'Macchinine automatiche',
    'Q22': 'Animali in ordine',
    'Q23': 'Frecce che spariscono',
    'Q24': 'Arte programmata',
    'Q25': "Mappa del tesoro",
    'Q27': 'La mappa delle monete',
    'Q28': 'Robot da disegno',
    'Q29': 'Trasformazione di immagini',
    'Q30': 'I doni di Babbo Castoro',
    'Q31': 'Tastiera crittografica'
}

# The question codes found in the data must be exactly the ones that have a
# title in MAPNAMES; on mismatch, report which codes are out of sync.
_titled_ids = set(MAPNAMES.keys())
_seen_ids = set(quiz['q_id'].unique())
assert _titled_ids == _seen_ids, (
    f"MAPNAMES is out of sync with the quiz data: "
    f"codes without a title: {sorted(map(str, _seen_ids - _titled_ids))}; "
    f"titles without data: {sorted(map(str, _titled_ids - _seen_ids))}")

# Shorter column names for the per-question fields.
quiz = quiz.rename(columns={'q_class': 'cat', 'q_score': 'score', 'q_scoreMax': 'score_max', 'q_time': 'time'})

quiz['nome'] = quiz['q_id']
quiz['edizione'] = '2024'
# Outcome flags: full marks, partial credit (positive but not full), penalty.
quiz['completo'] = quiz['score'] == quiz['score_max']
quiz['parziale'] = (quiz['score'] > 0) & (quiz['score'] != quiz['score_max'])
quiz['penalizzato'] = quiz['score'] < 0
# Fraction of the maximum score that was obtained.
quiz['voto'] = quiz['score'] / quiz['score_max'].astype('float64')
# Time spent in minutes; values outside the 0..45 minute range become NaN.
quiz['minuti'] = quiz['time'].map(lambda secs: float(secs)/60. if 0 <= float(secs) <= 45*60 else np.nan)

quiz_export_columns = ['anonid', 'cat', 'edizione', 'nome', 'score', 'score_max', 'time']
quiz.to_csv('quiz.csv', columns=quiz_export_columns)

# Attach the team-level columns to every question record of that team.
team_columns = ['anonid', 'categoria', 'punteggio','punteggio_norm','orainizio','teacher_id','school_cap']
vquiz = valid[team_columns].merge(quiz, on='anonid')

# One stacked bar chart per category: share of teams that fully solved each
# question (blue) and share that got partial credit (light blue), annotated
# with the mean minutes spent and the maximum score of the question.
cats_sorted = valid['categoria'].unique().sort_values()
# squeeze=False always returns a 2-D array of Axes, so `ax[j]` also works
# when there is a single category (plt.subplots would otherwise return a
# bare Axes object).
fig, ax = plt.subplots(ncols=1, nrows=len(cats_sorted), figsize=(16, 40), squeeze=False)
ax = ax[:, 0]
for j, k in enumerate(cats_sorted):
    ax[j].set_ylim([0, 1])
    # Per-question means; for the boolean flags the mean is the share of teams.
    m = vquiz[vquiz['categoria'] == k].groupby('nome', sort=False)\
        [['completo','voto', 'parziale', 'penalizzato', 'minuti','score_max']].mean()
    positions = np.arange(m.index.size)
    c = ax[j].bar(positions, m['completo'], color='blue')
    p = ax[j].bar(positions, m['parziale'], bottom=m['completo'], color='lightblue')
    ax[j].set_xticks(positions, [MAPNAMES[x] for x in m.index.tolist()], rotation=45)
    ax[j].set_yticks(np.arange(0,1.2,.2), [f'{100*y:.0f}%' for y in np.arange(0,1.2,.2)])
    for i in range(m.index.size):
        # Mean minutes inside the bar, maximum score near the baseline.
        ax[j].annotate(text=f"{m['minuti'].iloc[i]:.0f}'", xy=(i, .75*m['completo'].iloc[i]), color='white')
        ax[j].annotate(text=f"{m['score_max'].iloc[i]:.0f}", xy=(i-.15, .02), color='yellow', fontsize='x-large')
    ax[j].legend((c[0],p[0]), ('completo','parziale'), loc=(.92,.6))
    ax[j].set_title(f'{k}: tassi di soluzione (il numero in alto indica i minuti spesi in media sul quesito, '
                    'il numero in basso il punteggio massimo ottenibile)')

fig.tight_layout()
fig.savefig('tassisol.png')

# One bar chart per category: mean fraction of the maximum score obtained on
# each question (green, upwards) and share of penalised teams (red, downwards),
# annotated with the maximum score of the question.
cats_sorted = valid['categoria'].unique().sort_values()
# squeeze=False always returns a 2-D array of Axes, so `ax[j]` also works
# when there is a single category.
fig, ax = plt.subplots(ncols=1, nrows=len(cats_sorted), figsize=(16, 40), squeeze=False)
ax = ax[:, 0]
for j, k in enumerate(cats_sorted):
    ax[j].set_ylim([-1, 1])
    # Per-question means; for the boolean flags the mean is the share of teams.
    m = vquiz[vquiz['categoria'] == k].groupby('nome', sort=False)\
        [['completo','voto', 'parziale', 'penalizzato', 'minuti','score_max']].mean()
    positions = np.arange(m.index.size)

    c = ax[j].bar(positions, m['voto'], color='green')
    z = ax[j].bar(positions, -m['penalizzato'], color='red')
    # Tick labels show absolute percentages on both sides of zero.
    ax[j].set_yticks(np.arange(-1,1.2,.2), [f'{100*abs(y):.0f}%' for y in np.arange(-1,1.2,.2)])

    ax[j].set_xticks(positions, [MAPNAMES[x] for x in m.index.tolist()], rotation=45)
    for i in range(m.index.size):
        # Double quotes outside, single quotes inside: reusing the same quote
        # character inside an f-string is a SyntaxError before Python 3.12.
        ax[j].annotate(text=f"{m['score_max'].iloc[i]:.0f}", xy=(i, -.8), color='blue')

    ax[j].legend((c[0],z[0]), ('punteggio','penalità'), loc=(0.91,.725))
    ax[j].set_title(f'{k}: percentuale di punteggio attribuito in media, in rosso la percentuale di penalizzati (il numero in basso è il punteggio massimo)')

fig.tight_layout()
fig.savefig('punti.png')

Analisi delle squadre

# Build one record per team member, tagged with the team's category and
# anonymous id. Teams without a usable 'members' list are skipped.
members = []
for r in valid.itertuples():
    if r.team_composition and 'members' in r.team_composition:
        team_category = r.category.lower()
        for m in r.team_composition['members']:
            # Build a new dict instead of adding keys to `m`: the member dicts
            # live inside `valid` and must not be modified as a side effect.
            # `r.anonid` is the MD5 of the team id already stored in the frame.
            members.append({**m, 'categoria': team_category, 'anonid': r.anonid})

# One row per pupil; a '-' in `sex` is treated as missing information.
pupils = pd.DataFrame(members)
pupils['genere'] = pupils['sex'].map(lambda s: np.nan if s == '-' else s)
# Same ordered categorical used for the teams, so categories sort consistently.
pupil_category_dtype = pd.api.types.CategoricalDtype(categories = CATEGORIES, ordered=True)
pupils['categoria'] = pupils['categoria'].astype(pupil_category_dtype)

# Non-null counts of every column per (category, gender); only the 'class'
# column of this table is read below.
# NOTE(review): this counts pupils with a non-null 'class' -- confirm that
# 'class' is always filled in, otherwise pupils are under-counted.
gender = pupils[pupils['genere'].notnull()].groupby(['categoria', 'genere']).count()
txt = '''<table><caption>Studenti partecipanti al Bebras 2024 con risultati validi 
(i dati dipendono dalla corretta compilazione dei profili delle squadre)</caption>
<thead>
  <tr><th>Categoria</th>
  <th>studenti</th>
  <th>femmine</th>
  <th>maschi</th>
  <th>squadre con dati mancanti</th>
  <th>media componenti per squadra</th>
  </tr>
<tbody>
'''
# Number of distinct teams with at least one member of known gender, and with
# at least one member of unknown gender, per category.
notempty = pupils[pupils['genere'].isin(['m', 'f'])].groupby('categoria')['anonid'].nunique()
empty = pupils[pupils['genere'].isnull()].groupby('categoria')['anonid'].nunique()

totf = 0
totm = 0
tot = 0
for k in pupils['categoria'].unique().sort_values():
    girls = gender.loc[(k,'f')]['class']
    boys = gender.loc[(k,'m')]['class']
    known = girls + boys
    incomplete_teams = empty[k]
    totf += girls
    totm += boys
    # Lower bound on pupils: each team with missing data counts for at least
    # one pupil (two for categories whose name contains '-double').
    tot += known + incomplete_teams*(2 if '-double' in k else 1)

    txt += f'<tr><th>{k}</th><td>{known}</td><td>{girls} ({100*float(girls)/float(known):.1f}%)</td><td>{boys} ({100*float(boys)/float(known):.1f}%)</td><td>{incomplete_teams}</td><td>{float(known) / float(notempty[k]):.2f}</td></tr>'

grand_known = totf+totm
txt += f'<tr><th>Totale:</th><td>{grand_known}</td><td>{totf} ({100*float(totf)/float(grand_known):.1f}%)</td><td>{totm} ({100*float(totm)/float(grand_known):.1f}%)</td></tr>'
txt += f'<tr><th>Totale comprese squadre con dati mancanti</th><td> ≧ {tot}</td></tr>'
txt += '</table>'
display(Markdown(txt))

# Gender composition of each team as a sorted string of 'f'/'m' letters
# (e.g. 'ffm'), built from the members whose gender is known.
known_gender = pupils[pupils['genere'].isin(['m', 'f'])]
compositions = known_gender.groupby('anonid')['genere'].agg(lambda letters: ''.join(sorted(letters)))

# Attach the composition to every question record and derive the head counts.
gquiz = vquiz.merge(compositions, on='anonid')

gquiz['n_m'] = gquiz['genere'].str.count('m')
gquiz['n_f'] = gquiz['genere'].str.count('f')
gquiz['n'] = gquiz['n_m'] + gquiz['n_f']

gquiz.to_csv('mf.csv')

I nomi delle squadre più comuni

import re
from collections import Counter

# Names that are not interesting for the ranking: numbers, class-like codes,
# single characters, roman numerals, generic words and empty strings.
notwanted = re.compile('^0[0-9]+$|^[0-9][a-zA-Z0-9_]|^the$|^and$|^classe$|^squadra$|^gruppo$|^team$|^i+$|^iv$|^[a-zA-Z0-9_]$|^prima$|^seconda$\
|^terza$|^quarta$|^quinta$|^squadra|^$')

# Normalise the names (trim, lower-case) and drop missing ones: a NaN entry
# would make `notwanted.match` raise a TypeError.
names = scoredf['team_name'].str.strip().str.lower().dropna().tolist()
oknames = filter(lambda w: not notwanted.match(w), names)

c = Counter(oknames)

# The 30 most frequent team names with their number of occurrences.
c.most_common(30)

[('i matematici', 29),
 ('blu', 29),
 ('viola', 22),
 ('giallo', 22),
 ('verde', 22),
 ('rosso', 21),
 ('le girls', 20),
 ('saturno', 17),
 ('marte', 16),
 ('giove', 16),
 ('venere', 16),
 ('mercurio', 15),
 ('azzurro', 15),
 ('le baddie', 15),
 ('leoni', 14),
 ('i castori', 14),
 ('rosa', 14),
 ('i tre moschettieri', 14),
 ('arancione', 13),
 ('le stelle', 13),
 ('le winx', 13),
 ('i senza nome', 12),
 ('bianco', 12),
 ('i capibara', 12),
 ('i fantastici tre', 11),
 ('le tigri', 11),
 ('i fantastici 3', 11),
 ('gli invincibili', 10),
 ('terra', 10),
 ('gli hacker', 10)]

# Pie chart of the operating systems used by the teams.
plt.axis('off')
# Count once and size `explode` from the counts themselves: value_counts()
# drops missing values while unique() keeps them, so using unique() for the
# length could produce one offset too many.
os_counts = scoredf['operating_system'].value_counts()
# The result is NOT bound to the name `os`: that would shadow the `os` module
# and break `os.path.exists` when the download cell is run again.
os_pie = os_counts.plot.pie(autopct='%.1f', radius=1.22,
                            explode=[.06*i*i for i in range(len(os_counts))],
                            figsize=(5,5), title='Sistemi operativi utilizzati')

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48
0.8	1.1	2.0	2.9	4.0	5.9	7.5	10.1	12.1	14.8	17.2	19.7	22.9	26.2	29.2	32.5	36.1	39.6	43.5	46.9	50.7	54.1	56.8	60.9	63.9	67.2	69.6	72.3	75.6	78.5	81.0	83.3	85.4	87.5	89.0	90.6	92.2	93.5	94.7	95.7	96.6	97.1	98.2	98.3	99.2	99.2	99.5	99.6

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48
0.6	1.0	1.5	2.1	2.9	3.7	4.9	6.2	7.6	9.1	11.3	13.6	16.0	18.7	21.4	24.1	27.2	30.4	33.7	37.3	40.9	44.4	48.2	51.9	55.5	59.0	62.6	66.2	69.2	72.6	75.5	78.3	80.8	83.2	85.4	87.4	89.4	91.3	92.8	94.3	95.4	96.2	97.2	98.0	98.4	99.0	99.4	99.6

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48
0.6	0.8	1.1	1.9	2.4	3.5	4.4	5.9	7.4	9.7	11.8	14.4	17.2	20.7	24.2	28.1	32.7	36.6	41.7	45.2	50.2	53.3	57.7	61.1	65.5	68.6	72.8	75.1	78.3	80.5	83.3	84.9	87.1	88.3	90.2	91.2	92.7	93.6	94.9	95.7	96.7	97.3	98.1	98.5	98.6	99.2	99.5	99.6

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48
0.5	0.8	1.8	2.7	4.4	5.8	7.8	10.9	14.5	17.8	22.6	27.2	32.5	37.2	42.5	46.9	51.8	56.3	61.4	64.9	68.8	72.2	75.4	78.6	81.1	83.5	86.5	88.6	90.2	91.2	92.4	93.1	93.9	94.6	95.4	96.3	97.1	97.7	97.9	98.2	98.4	98.7	99.1	99.3	99.4	99.6	99.7	99.7

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48
1.9	2.0	5.0	5.9	10.9	13.4	19.1	23.1	28.7	32.7	39.0	44.0	50.3	54.0	59.2	63.5	67.9	72.3	76.0	79.7	82.7	85.1	88.1	90.0	91.7	93.1	95.0	95.9	96.9	97.3	97.7	98.0	98.0	98.4	98.6	98.6	98.8	99.2	99.4	99.4	99.4	99.5	99.5	99.6	99.6	99.6	99.8	99.8

Categoria	squadre	min	max	media	std.dev.	I quartile	mediana	III quartile	Squadre al minimo	Squadre al massimo
kilo	3463	0	48	20.9	10.3	13	20	28	0.8%	0.4%
mega	10697	0	48	23.1	10.1	16	23	30	0.6%	0.4%
giga	5016	0	48	21.4	9.5	15	20	27	0.6%	0.4%
tera	2654	0	48	17.2	8.6	11	16	22	0.5%	0.3%
peta	1883	0	48	13.5	7.7	8	12	18	1.9%	0.2%
Totale	23713

Categoria	studenti	femmine	maschi	squadre con dati mancanti	media componenti per squadra
kilo	7767	3737 (48.1%)	4030 (51.9%)	1021	2.74
mega	21544	10338 (48.0%)	11206 (52.0%)	3593	2.70
giga	9936	4849 (48.8%)	5087 (51.2%)	1760	2.68
tera	5114	1378 (26.9%)	3736 (73.1%)	1162	2.64
peta	3716	852 (22.9%)	2864 (77.1%)	759	2.55
Totale:	48077	21154 (44.0%)	26923 (56.0%)
Totale comprese squadre con dati mancanti	≧ 56372