Statistiche gare Bebras italiano 2017¶

from IPython.display import HTML

# Emit a button that shows/hides every notebook input cell (`div.input`).
# The embedded script uses the `$` global, so it needs jQuery to be available on the page.
# `code_toggle` is also registered as the document-ready handler; since `code_show`
# starts out true, that first call hides the code.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<input type="button" value="Clicca per vedere/nascondere il codice Python" onclick="code_toggle()">''')

Distribuzione dei punteggi¶

import hashlib
import json
import os.path
import urllib
import urllib.request  # `import urllib` alone does not guarantee the `request` submodule is loaded

import numpy as np
import pandas as pd

pd.options.display.max_rows = None
pd.options.display.max_columns = None

CATS = ('kilo', 'mega', 'giga', 'tera', 'peta')
FIRST_TEST_ID = 50    # API test id requested for CATS[0]; the following categories use consecutive ids
TZ_OFFSET_S = 60*60   # seconds added to the exam timestamps to correct for the time zone
SLOT_S = 45*60        # length of a contest (45 minutes), used to bucket the start times

# The API key is read from the first line of a local file instead of being written in the notebook.
with open('secret.key') as keyfile:
    key = keyfile.readline().strip()

# Download each category's results only when no cached copy exists on disk.
for offset, cat in enumerate(CATS):
    cache = "results-{}.json".format(cat)
    if not os.path.exists(cache):
        url = "https://bebras.it/api?key={}&view=exams&test={}".format(key, FIRST_TEST_ID + offset)
        # Fetch first, then open the cache file, so a failed request leaves no empty file behind.
        with urllib.request.urlopen(url) as response:
            payload = response.read().decode('utf-8')
        with open(cache, "w", encoding='utf-8') as out:
            out.write(payload)

# Concatenate the 'exams' lists of all categories into one list of records.
score = []
for cat in CATS:
    with open("results-{}.json".format(cat), "r", encoding='utf-8') as cached:
        score += json.load(cached)['exams']

scoredf = pd.DataFrame(score)

# The start time has to be corrected for the time zone.
start = scoredf['exam_date'].astype('int64') + TZ_OFFSET_S
scoredf['server_start'] = pd.to_datetime(start, unit='s')
scoredf['orainizio'] = np.floor(start / SLOT_S)  # index of the 45-minute slot in which the exam started
scoredf['punteggio'] = pd.to_numeric(scoredf['score'])
# NOTE(review): an unsalted MD5 of the team id is only a weak pseudonym if ids are easy to
# enumerate -- confirm this is acceptable before publishing the CSV.
scoredf['anonid'] = scoredf['team_id'].map(lambda x: hashlib.md5(str(x).encode('utf8')).hexdigest())
# Ordered categorical, so that categories sort as in CATS rather than alphabetically.
scoredf['categoria'] = scoredf['category'].str.lower().astype(
    pd.api.types.CategoricalDtype(categories=list(CATS), ordered=True))

# Keep only the exams flagged as having a valid score and export their anonymised results.
valid = scoredf[scoredf['exam_valid_score'] == 1]
valid.to_csv('anonris.csv', columns=['anonid', 'categoria', 'orainizio', 'punteggio', 'time'])

from IPython.display import display, Markdown

# Build an HTML table with, for every category, the descriptive statistics of the valid scores
# and the share of teams sitting exactly at the minimum and at the maximum score.
txt = '''<table>
<caption>Squadre partecipanti al Bebras 2017/18 con risultati validi, 
cioè ritenuti confrontabili con gli altri perché privi di anomalie tecniche o organizzative</caption>
<thead>
  <tr><th>Categoria</th>
  <th>squadre</th>
  <th> min </th>
  <th> max </th>
  <th> media </th>
  <th> std.dev. </th>
  <th>I quartile </th>
  <th>mediana </th>
  <th>III quartile</th>
  <th>Squadre al minimo</th>
  <th>Squadre al massimo</th>
</tr>
</thead>
<tbody>
'''
row_template = ("<tr><th>{}</th><td>{}</td><td>{}</td><td>{}</td><td>{:.1f}</td>"
                "<td>{:3.1f}</td><td>{}</td><td>{}</td><td>{}</td><td>{:.1f}%</td><td>{:.1f}%</td></tr>")
for k in valid['categoria'].unique():
    points = valid.loc[valid['categoria'] == k, 'punteggio']
    s = points.describe()
    # Count the extremes on the numeric column the statistics were computed from,
    # comparing with the untruncated min/max.
    n_top = int((points == s['max']).sum())
    n_bottom = int((points == s['min']).sum())
    txt += row_template.format(k,
                               int(s['count']),
                               int(s['min']),
                               int(s['max']),
                               float(s['mean']),
                               float(s['std']),
                               int(s['25%']),
                               int(s['50%']),
                               int(s['75%']),
                               100*n_bottom/float(s['count']),
                               100*n_top/float(s['count']))
txt += '</tbody>'
txt += '<tfoot><tr><th>Totale</th><td>{}</td></tr></tfoot>'.format(valid['punteggio'].count())
txt += '</table>'
display(Markdown(txt))

%matplotlib inline
import matplotlib.pyplot as plt
# Switch matplotlib to the 'ggplot' style sheet.
plt.style.use('ggplot')

# One histogram of the valid scores per category, 30 bins each, on a 10x8 figure.
histograms = valid['punteggio'].hist(by=valid['categoria'], bins=30, figsize=(10,8))

Percentili per punteggio¶

# For every category show, for each score from 1 up to the category maximum,
# the percentage of teams whose score is strictly lower.
for k in valid['categoria'].unique():
    points = valid.loc[valid['categoria'] == k, 'punteggio']  # filter the category once, not per threshold
    tot = float(points.count())
    top = int(points.max())
    thresholds = range(1, top+1)
    pp = [100 * (points < i).sum() / tot for i in thresholds]
    txt = '''<table>
    <caption>Percentili per la categoria {} (che percentuale di squadre si supera con un dato punteggio)</caption>
    <thead>'''.format(k)
    # Cells are wrapped in rows and every section is closed, so the markup is well formed.
    txt += '<tr>' + ''.join(['<td>{}</td>'.format(i) for i in thresholds]) + '</tr></thead>'
    txt += '<tbody>'
    txt += '<tr>' + ''.join(['<td>{:.1f}</td>'.format(f) for f in pp]) + '</tr></tbody>'
    txt += '</table>'
    display(Markdown(txt))

Analisi delle risposte¶

# Flatten the per-exam question lists into one record per (team, question),
# keeping only the question fields of interest plus the team's anonymous id.
QUESTION_FIELDS = ('q_id','q_class','q_score','q_scoreMax','q_time')
rr = [
    dict({field: question[field] for field in QUESTION_FIELDS}, anonid=exam.anonid)
    for exam in valid.itertuples()
    for question in exam.exam_data['questions']
]

quiz = pd.DataFrame(rr)

# Map the last '_'-separated piece of every question id to its second piece.
MAPBEBRAS = {}
for qid in quiz['q_id'].unique():
    pieces = qid.split('_')
    MAPBEBRAS[pieces[-1]] = pieces[1]

# Display titles of the tasks, keyed by the short task name.
MAPNAMES = {
    'Acquapark': 'Parco acquatico',
    'AllHome': 'Tutti a casa',
    'Balls': 'Palline rotolanti',
    'BeaverTournament': 'Il torneo di pallavolo',
    'Beaverhotel': 'Pulizie',
    'Bebragram': 'Bebragram',
    'CamminoDiArabot': 'Il cammino di Ara-bot',
    'Chain': 'Festoni',
    'ColorPaths': 'Sentieri colorati',
    'Crossbreeds': 'Incroci tra animali',
    'DanceOff': 'Balletto interattivo',
    'DigitRecognition': 'Riconoscimento di cifre',
    'Files': 'Cerca i documenti',
    'FindTheGap': 'Attraverso il passaggio',
    'GiveMeASmile': 'Dammi un sorriso',
    'IconImageCompression': 'Compressione di figure',
    'Intrusion': 'Intrusione',
    'Music': 'Ritornelli',
    'Ninja': 'I soprannomi dei ninja',
    'OfficeLights': "Luci nell'ufficio",
    'PaintingWallPaper': 'La tapezzeria',
    'ParkingLot': 'Il parcheggio',
    'Railroad': 'Ferrovia',
    'Robot': 'Robot',
    'Skyscraper': 'Il grattacielo luminoso',
    'ToyStorage': 'Giocattoli in ordine',
    'Worm': 'Un verme affamato',
    'blocks': 'Un mondo di blocchi',
    'pizzeria': 'Pizze e calzoni',
    'toys': 'Regali di Natale',
}

# Drop the 'q_' prefixes from the question columns.
quiz = quiz.rename(columns={'q_time': 'time', 'q_score': 'score', 'q_scoreMax': 'score_max', 'q_class': 'cat'})

# Split the question id '<digits>_<bebras id>_<name>' into its three parts.
# Raw strings keep '\d' from being treated as an (invalid) string escape.
quiz['nome'] = quiz['q_id'].str.extract(r'\d+_.+_(.+)', expand=False)
quiz['edizione'] = quiz['q_id'].str.extract(r'(\d+)_.+_.+', expand=False)
quiz['bebras'] = quiz['q_id'].str.extract(r'\d+_(.+)_.+', expand=False)
quiz['completo'] = quiz['score'] == quiz['score_max']                          # full marks
quiz['parziale'] = (quiz['score'] > 0) & (quiz['score'] != quiz['score_max'])  # some points, but not all
quiz['voto'] = quiz['score'] / quiz['score_max'].astype('float64')             # fraction of the maximum score
# Time in minutes; negative times are turned into NaN.
quiz['minuti'] = quiz['time'].map(lambda x: float(x)/60. if float(x) >= 0 else float('nan'))

quiz.to_csv('quiz.csv', columns=['anonid', 'cat', 'edizione', 'nome', 'bebras', 'score', 'score_max', 'time'])

# One row per (valid exam, question), carrying the exam-level columns alongside the answer.
vquiz = pd.merge(valid[['anonid', 'categoria', 'punteggio','orainizio','teacher_id','school_cap']], quiz, on='anonid')

def bname(n):
    """Return the label to show for the task whose short name is `n`.

    When `n` is a key of both MAPNAMES and MAPBEBRAS the label is
    '<MAPNAMES[n]> (<MAPBEBRAS[n]>)'; otherwise `n` is returned unchanged.
    """
    if n in MAPBEBRAS and n in MAPNAMES:
        return '{} ({})'.format(MAPNAMES[n], MAPBEBRAS[n])
    return n

plt.figure(figsize=(16,20))

# One stacked bar chart per category: share of complete answers, with the share of partial ones on top.
categories = valid['categoria'].unique()
for j, k in enumerate(categories):
    plt.subplot(len(categories), 1, j+1)
    plt.ylim(0,1.2)
    m = vquiz[vquiz['categoria'] == k].groupby('nome',
                                             sort=False)[['completo','voto', 'parziale', 'minuti','score_max']].mean()
    positions = list(range(len(m)))

    c = plt.bar(positions, m['completo'], color='blue')
    p = plt.bar(positions, m['parziale'], bottom=m['completo'], color='lightblue')
    # Tick labels are passed as a real list: a bare map() iterator has no length.
    plt.xticks([x + 0.4 for x in positions], [bname(n) for n in m.index.tolist()], rotation=90)
    for i, y in enumerate(m['voto'].tolist()):
        # The annotation text is passed positionally because its keyword name differs across matplotlib versions.
        plt.annotate('{:.0f}'.format(m['minuti'].iloc[i]), xy=(i+0.3, y+.08))
        plt.annotate('{}'.format(m['score_max'].iloc[i]), xy=(i+0.3, .02), color='yellow')
    plt.legend((c[0],p[0]), ('completo','parziale'))
    plt.title('{}: tassi di soluzione (il numero in alto indica i minuti spesi in media sul quesito, '
              'il numero in basso il punteggio massimo ottenibile)'.format(k))

plt.tight_layout()

plt.figure(figsize=(16,20))

# One bar chart per category: average fraction of the maximum score obtained on each task.
categories = valid['categoria'].unique()
for j, k in enumerate(categories):
    plt.subplot(len(categories), 1, j+1)
    plt.ylim(0,1.2)
    m = vquiz[vquiz['categoria'] == k].groupby('nome',
                                             sort=False)[['completo','voto', 'parziale', 'minuti','score_max']].mean()
    positions = list(range(len(m)))

    plt.bar(positions, m['voto'], color='green')
    # Tick labels are passed as a real list: a bare map() iterator has no length.
    plt.xticks([x + 0.4 for x in positions], [bname(n) for n in m.index.tolist()], rotation=90)
    for i, y in enumerate(m['voto'].tolist()):
        # The annotation text is passed positionally because its keyword name differs across matplotlib versions.
        plt.annotate('{}'.format(m['score_max'].iloc[i]), xy=(i+0.3, y+.08), color='red')

    plt.title('{}: percentuale di punteggio attribuito in media (in rosso il punteggio massimo ottenibile)'.format(k))

plt.tight_layout()

Analisi delle squadre¶

# Collect one record per team member of every valid exam, tagged with the team's category.
members = []
for r in valid.itertuples():
    composition = r.team_composition
    if composition and 'members' in composition:
        team_cat = r.category.lower()
        # Copy each member record instead of writing into the dicts stored inside `valid`.
        members.extend(dict(member, categoria=team_cat) for member in composition['members'])

pupils = pd.DataFrame(members)
# A '-' in the 'sex' field is turned into a missing value.
pupils['genere'] = pupils['sex'].map(lambda x: x if x != '-' else float('nan'))
# Ordered categorical, so that categories sort as in CATS rather than alphabetically.
pupils['categoria'] = pupils['categoria'].astype(
    pd.api.types.CategoricalDtype(categories=list(CATS), ordered=True))

def _ratio(num, den):
    """Return num/den as a float, or NaN when the denominator is zero."""
    return float(num) / float(den) if den else float('nan')

# Non-null counts per (category, gender); rows with a missing gender drop out of the grouping.
gender = pupils[(pupils['name'] != '') | pupils['genere'].notnull()].groupby(['categoria', 'genere']).count()
txt = '''<table><caption>Studenti partecipanti al Bebras 2017 con risultati validi 
(i dati dipendono dalla corretta compilazione dei profili delle squadre)</caption>
<thead>
  <tr><th>Categoria</th>
  <th>studenti</th>
  <th>femmine</th>
  <th>maschi</th>
  <th>squadre con dati mancanti</th>
  <th>media component per squadra</th>
  </tr>
</thead>
<tbody>
'''

# Number of valid teams per category; it does not change inside the loop, so compute it once.
teams = valid.groupby('categoria')['login'].count()

totf = 0
totm = 0
for k in pupils['categoria'].unique():
    f = gender.loc[(k,'f')]['class']
    totf += f
    m = gender.loc[(k,'m')]['class']
    totm += m
    s = teams.loc[k]
    empty = len(valid[(valid['categoria'] == k) &(valid['team_composition'] == False)])
    # Average team size = known members (females + males) over the teams that reported their composition.
    txt += '<tr><th>{}</th><td>{}</td><td>{} ({:.1f}%)</td><td>{} ({:.1f}%)</td><td>{}</td><td>{:.1f}</td></tr>'.format(
        k, f+m, f, 100*_ratio(f, f+m), m, 100*_ratio(m, f+m), empty, _ratio(f+m, s - empty)
    )
txt += '</tbody>'
txt += '<tfoot><tr><th>Totale:</th><td>{}</td><td>{} ({:.1f}%)</td><td>{} ({:.1f}%)</td></tr></tfoot>'.format(
    totf+totm,
    totf, 100*_ratio(totf, totf+totm),
    totm, 100*_ratio(totm, totf+totm))
txt += '</table>'
display(Markdown(txt))

I nomi delle squadre più comuni¶

import re
from collections import Counter

# Names that should not be counted: a zero followed by digits, anything starting with a digit
# plus a word character, single characters, and a fixed list of generic words.
# Raw strings keep '\d' and '\w' from being treated as (invalid) string escapes.
notwanted = re.compile(
    r'^0\d+$|^\d\w|^the$|^and$|^classe$|^squadra$|^gruppo$|^team$|^i+$|^iv$|^\w$|^prima$|^seconda$'
    r'|^terza$|^quarta$|^quinta$')

# Normalise the names (trimmed, lower case); missing names are dropped, since the regex cannot match them.
names = scoredf['team_name'].dropna().str.strip().str.lower().tolist()
# A list, not a one-shot filter iterator, so `oknames` can be inspected again after counting.
oknames = [w for w in names if not notwanted.match(w)]

c = Counter(oknames)

c.most_common(30)

[('i fantastici 4', 33),
 ('i matematici', 31),
 ('leoni', 23),
 ('gli informatici', 21),
 ('blu', 18),
 ('aquile', 16),
 ('tigri', 16),
 ('gli invincibili', 16),
 ('the best', 15),
 ('marte', 14),
 ('lupi', 13),
 ('ghepardi', 12),
 ('pantere', 12),
 ('i castori', 11),
 ('verdi', 10),
 ('matematici', 10),
 ('i fantastici quattro', 10),
 ('i cervelloni', 10),
 ('nettuno', 10),
 ('i mitici', 10),
 ('venere', 10),
 ('i leoni', 10),
 ('informatici', 10),
 ('squali', 10),
 ('saturno', 10),
 ('i tecnologici', 10),
 ('i campioni', 9),
 ('delfini', 9),
 ('rossi', 9),
 ('giove', 9)]

# Pie chart of the operating systems reported by the teams.
# The counts are computed once so that `explode` has exactly one entry per slice
# (value_counts() leaves out missing values, unique() does not).
os_counts = scoredf['operating_system'].value_counts()
plt.axis('off')
# The result is not bound to the name `os`, which would shadow the `os` module
# that the data-loading cell relies on.
os_pie = os_counts.plot.pie(autopct='%.1f', radius=1.22,
                            explode=[.06*i*i for i in range(len(os_counts))],
                            figsize=(5,5), title='Sistemi operativi utilizzati')

Categoria	squadre	min	max	media	std.dev.	I quartile	mediana	III quartile	Squadre al minimo	Squadre al massimo
kilo	3351	0	39	26.2	7.5	21	27	31	0.4%	5.5%
mega	3746	0	38	23.8	8.4	18	25	30	0.3%	6.9%
giga	1775	0	37	20.7	8.2	15	21	27	0.6%	3.6%
tera	1897	0	37	21.4	8.8	15	21	28	0.5%	3.6%
peta	1348	0	37	22.0	8.7	16	23	29	0.7%	3.5%
Totale	12117

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39
0.4	0.4	0.4	0.7	0.7	0.8	0.9	1.2	1.5	1.8	2.3	2.9	5.0	5.7	6.9	8.7	12.0	13.0	14.6	18.9	23.4	25.8	27.6	35.4	39.9	43.2	47.1	56.1	60.1	61.9	69.5	75.6	79.3	80.3	87.0	90.0	93.8	94.5	94.5

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38
0.3	0.3	0.3	0.6	1.2	1.4	1.5	3.2	3.7	3.9	6.0	8.8	9.2	10.4	15.6	17.7	18.5	23.0	30.4	31.8	33.3	39.6	46.0	48.4	49.9	56.1	61.7	64.7	66.9	72.4	76.6	79.2	81.3	84.1	89.8	91.5	93.1	93.1

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37
0.6	0.6	0.7	1.3	1.6	2.6	3.6	4.5	6.9	8.9	11.0	14.8	16.7	20.6	24.8	27.7	32.9	37.0	40.6	45.2	49.6	54.9	58.6	63.0	66.9	70.6	74.7	77.1	81.9	84.3	86.5	88.4	91.0	93.9	95.6	96.4	96.4

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37
0.5	0.6	0.7	1.4	2.3	3.3	4.2	5.8	7.6	9.9	12.3	14.7	17.9	20.8	24.2	27.8	31.6	35.5	39.6	42.6	45.7	50.0	53.5	56.9	60.9	65.1	68.6	71.6	75.3	78.3	81.1	84.3	88.1	91.0	93.1	94.1	96.4

1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37
0.7	0.7	1.0	1.4	2.0	2.6	4.1	5.6	7.0	9.1	10.5	12.7	15.9	18.1	21.6	24.6	28.5	32.1	34.9	38.3	42.1	46.1	49.5	54.0	57.9	62.5	65.8	69.6	74.5	77.4	80.0	83.0	88.9	91.1	92.9	93.9	96.5

Categoria	studenti	femmine	maschi	squadre con dati mancanti	media component per squadra
kilo	11654	5629 (48.3%)	6025 (51.7%)	0	2.8
mega	12855	6068 (47.2%)	6787 (52.8%)	0	2.8
giga	6179	2857 (46.2%)	3322 (53.8%)	0	2.9
tera	6230	1853 (29.7%)	4377 (70.3%)	0	3.3
peta	4146	1137 (27.4%)	3009 (72.6%)	0	3.2
Totale:	41064	17544 (42.7%)	23520 (57.3%)