Image, Lenin-like
2018-06-30 12:00:00, by unknown

This is a production notebook. It uses data gathered from a PBX and internal routing and will generate reports for TT (talk time), ACW (after-call work), and so on.
It is meant to be customizable, so one can easily adjust

  • the time range that should be evaluated
  • which of the available plots to produce:
    • hotline data
    • agent data
    • call distribution

The script can also easily iterate time ranges over the whole dataset and produce a plot for each (a sketch of this appears after the plotting function below).

This script makes little sense to run from a CLI; it is meant to run in an IPython environment.

Script code

A markdown dump straight out of the notebook:

%load_ext autoreload
%autoreload 2

## Import necessary modules
import os, sys
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import date2num, AutoDateFormatter, AutoDateLocator, WeekdayLocator, MonthLocator, DayLocator, DateLocator, DateFormatter
from matplotlib.dates import MO, TU, WE, TH, FR, SA, SU
from matplotlib.ticker import AutoMinorLocator, AutoLocator, FormatStrFormatter, ScalarFormatter
import numpy as np
import datetime, calendar
from datetime import timedelta
import matplotlib.patches as mpatches
from itertools import tee
from traitlets import traitlets

sys.path.append(os.path.abspath('/home/keuch/gits/keuch/code_box/pyt/spreadsheetparsing/entwuerfe/xls_testruns/lib/'))
from ce_funclib import determine_kernzeit as dtkz
from ce_funclib import continuity_check


from ipywidgets import widgets, interact, interactive, fixed, interact_manual, Layout
from IPython.display import display
#%matplotlib inline
%matplotlib tk
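
Both determine_kernzeit (imported above as dtkz) and continuity_check live in the author's private ce_funclib and are not part of the dump. For readers without that library, here is a minimal stand-in for dtkz; the notebook only relies on it mapping a timestamp to 'k' (Kernzeit, core time) or 'n' (Nebenzeit, off-time), and the concrete core-hours window below (weekdays 08:00 to 18:00) is purely an assumption:

def determine_kernzeit(ts):
    # hypothetical stand-in: weekday timestamps between 08:00 and 18:00 count
    # as core time ('k'), everything else as off-time ('n')
    if ts.weekday() < 5 and 8 <= ts.hour < 18:
        return 'k'
    return 'n'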


## Import data from the archive; the source must be a file with agent stats
arcpth='/home/keuch/gits/keuch/code_box/pyt/spreadsheetparsing/test_stats/archiv/'

######## GET A LIST OF MATCHING .xls FILES FROM THE GIVEN DIRECTORY

def collectxlfiles(arcpath):
    xlfilelist=list()

    for xlfile in os.listdir(arcpath):
        if xlfile.startswith('CE_alle_Ag'):
            xlfileabs=os.path.join(arcpath,xlfile)
            xlfilelist.append(xlfileabs)
    return sorted(xlfilelist)

xlfilelist=collectxlfiles(arcpth)
#xlfilelist
#examplefile=xlfilelist[233]
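
As a design note, the directory scan in collectxlfiles could just as well be written with glob; this sketch assumes the same CE_alle_Ag filename prefix:

import glob
# equivalent: collect and sort all matching files from the archive directory
xlfilelist = sorted(glob.glob(os.path.join(arcpth, 'CE_alle_Ag*')))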
###### TEST FOR DATA IN FILE, SORT OUT EMPTY FILES

## good dataframes, by definition, do not contain any zero values
## fill bad DFs with nan?

def filetoframe(exfile):
    exframe=pd.read_excel(exfile) # this is a regular pd.DataFrame
    datecell=exframe.iloc[0,1]
    sheet_datetime=pd.to_datetime(datecell,format='%d.%m %Y : %H')
    sheet_date=sheet_datetime.date()

    integritycheck=exframe.iloc[2,1] # files with data have 'Agenten' here, files with no calls have a NaN

    if integritycheck != 'Agenten':
        # if it's empty, keep date for filling it later
        print('Exception: ', end='')
        except_status='ex'

        usefulcols={0:'tstamp',1:'agent',3:'an',4:'be',22:'vl',24:'ht_float',29:'tt_float'} # map cols to decent names
        exframe=exframe.reindex(columns=sorted(usefulcols.keys()))
        exframe.rename(columns=usefulcols,inplace=True)        
        exframe=exframe[0:1] # strip text rows and the mangled sum row
        print(sheet_datetime)

        exframe['tstamp']=sheet_datetime
        exframe['date']=sheet_date
        exframe['agent']='nocalls_datum'
        exframe[['wd','ww','mm','yy']]=exframe['tstamp'].dt.strftime('%a,%W,%m,%Y').str.split(',',expand=True) # make ww,yy,mm,wd columns
        exframe['bz']=exframe['tstamp'].apply(dtkz)
        exframe['ort']=exframe['agent'].str[0] # split the identifier into useable columns
        exframe['id']='foobar' # split the identifier into useable columns

        # integers should be of appropriate datatype, we received them as strings
        # exframe[['vl','an','be','ww','mm','yy']]=exframe[['vl','an','be','ww','mm','yy']].astype(np.int64) #just for the beauty of it
        exframe.fillna(0, inplace=True) 
        exframe[['ww','mm','yy']]=exframe[['ww','mm','yy']].astype(np.int64) #just for the beauty of it
        #exframe.fillna(0, inplace=True) 
        return exframe,except_status

    else:
        except_status='reg'

        exframe.columns=range(0,30) # rename columns to a temporarily more readable format, fancy rename later
        usefulcols={0:'tstamp',1:'agent',3:'an',4:'be',22:'vl',24:'ht_float',29:'tt_float'} # map cols to decent names
        exframe=exframe[sorted(usefulcols.keys())] # skip cols and keep the ones we need
        exframe.rename(columns=usefulcols,inplace=True) # rename cols
        exframe=exframe[3:-1] # strip text rows and the mangled sum row
        exframe['tstamp']=pd.to_datetime(exframe['tstamp'],format=' %d.%m.%Y %H:%M ')
        exframe['date']=exframe['tstamp'].dt.date
        exframe[['wd','ww','mm','yy']]=exframe['tstamp'].dt.strftime('%a,%W,%m,%Y').str.split(',',expand=True) # make ww,yy,mm,wd columns
        exframe['bz']=exframe['tstamp'].apply(dtkz)

        exframe['ort']=exframe['agent'].str[0] # split the identifier into useable columns
        exframe['id']=exframe['agent'].str[-6:] # split the identifier into useable columns
        exframe['agent']=exframe['agent'].str[2:-7] # split the identifier into useable columns

        # integers should be of appropriate datatype, we received them as strings
        exframe[['vl','an','be','ww','mm','yy']]=exframe[['vl','an','be','ww','mm','yy']].astype(np.int64) #just for the beauty of it

        return exframe,except_status
framelist=list()
exceptionlist=list()
for xfile in xlfilelist:
    frame_from_file,except_status=filetoframe(xfile)
    if except_status=='ex':
        exceptionlist.append(xfile)
    framelist.append(frame_from_file)
Exception: 2017-04-17 00:00:00
Exception: 2017-05-14 00:00:00
Exception: 2017-11-19 00:00:00
Exception: 2017-12-03 00:00:00
Exception: 2017-12-10 00:00:00
Exception: 2018-03-04 00:00:00
Exception: 2018-03-11 00:00:00
Exception: 2018-04-08 00:00:00
#### produce a unified frame with all data and sort it by timestamp and agent name
bigframeii=pd.concat(framelist)
bigframeii.sort_values(['tstamp','agent'],inplace=True)
bigframeii.reset_index(drop=True,inplace=True) # there you go
# the exclusive logins have to be merged
unify_id={'gesinst':'995887','stanzju':'878457','papkeda':'891914'}
bigframeii.loc[bigframeii['id'] == unify_id['gesinst'],'agent'] = 'gesinst'
bigframeii.loc[bigframeii['id'] == unify_id['stanzju'],'agent'] = 'stanzju'
bigframeii.loc[bigframeii['id'] == unify_id['papkeda'],'agent'] = 'papkeda'
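
The three explicit assignments above could be collapsed into a single mapping pass; a compact equivalent using the same unify_id dictionary:

# invert the mapping (id -> canonical agent name) and apply it in one pass
id_to_agent = {agent_id: name for name, agent_id in unify_id.items()}
mask = bigframeii['id'].isin(id_to_agent)
bigframeii.loc[mask, 'agent'] = bigframeii.loc[mask, 'id'].map(id_to_agent)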
#### check whether all data (days) is complete, with no gaps
datenserie_uniq=bigframeii['date'].unique().tolist()
tage_bestand=len(datenserie_uniq)
tage_start=datenserie_uniq[0]
tage_ende=datenserie_uniq[-1]

missing_dates=continuity_check(datenserie_uniq)
if not missing_dates:
    print('no dates are missing')
else:
    print('the following dates are not within the frame:')
    print(missing_dates)
no dates are missing
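
continuity_check also comes from ce_funclib; the notebook only relies on it returning the calendar dates missing from the series. A minimal stand-in under that assumption:

def continuity_check(datelist):
    # hypothetical stand-in: report every calendar date between the first and
    # last entry that is absent from the input list of datetime.date values
    full_range = pd.date_range(datelist[0], datelist[-1], freq='D').date
    return sorted(set(full_range) - set(datelist))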
# get all agents available and create frames for kern and neben
allagents_list=sorted(bigframeii['agent'].unique())
allagents_list.extend(['Hagenow','Berlin','Alle'])
standorte=bigframeii.ort.unique().tolist()

We can't reconstruct individual calls anyway, since the raw data has already been grouped by the hour,
so we can go ahead and group by day to compute averages.

bigframeii.head(2)
#Samstage = bigframeii.loc[bigframeii['tstamp'].dt.dayofweek == 5]  # Saturdays only
   tstamp               agent    an   be   vl   ht_float  tt_float  date        wd   ww  mm  yy    bz  ort  id
0  2017-03-04 08:00:00  beckfra  1.0  1.0  0.0  2.3667    2.1333    2017-03-04  Sat  9   3   2017  n   H    216694
1  2017-03-04 08:00:00  tetzlva  1.0  1.0  0.0  2.6833    2.6167    2017-03-04  Sat  9   3   2017  n   B    613887
Datum_MIN = bigframeii.tstamp.dt.date.min()
Datum_MAX = bigframeii.tstamp.dt.date.max()

Datum_VON = datetime.date(2017,5,1)   # YYYY, MM, DD
Datum_BIS = datetime.date(2018,10,1)

Kernzeit = (bigframeii['bz'] ==  'k')
Nebenzeit = (bigframeii['bz'] ==  'n')

Berlin = (bigframeii['ort'] ==  'B')
Hagenow = (bigframeii['ort'] ==  'H')

Wunschzeitraum = ((bigframeii.tstamp.dt.date >= Datum_VON) & (bigframeii.tstamp.dt.date <= Datum_BIS))
#bigframeii.loc[Wunschzeitraum & Kernzeit & Hagenow]   # SUPER!
kzdata=bigframeii.loc[Wunschzeitraum & Kernzeit].copy()
kzdata_reindex=kzdata.set_index(['tstamp','ort']).copy() # timestamps and site as the new indexes
kzdata_byday=kzdata_reindex.groupby([pd.Grouper(freq='D', level='tstamp'), pd.Grouper(level='ort')]).sum() # grouped by day, all numbers summed
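
The prose above mentions averages, while this cell computes sums; swapping .sum() for .mean() on the same grouper gives the per-day average of the hourly rows, which is one plausible reading (an assumption, this variant is not in the dump):

# same grouping, but averaging the hourly rows instead of summing them
kzdata_daymean=kzdata_reindex.groupby([pd.Grouper(freq='D', level='tstamp'), pd.Grouper(level='ort')]).mean()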
def plotzi(frame):

    aht,aacw,att,nzb,bg ="#003873","#EE0042","#899EB2","#C7798F","#FEFFE2"
    sonntage=WeekdayLocator(byweekday=SU, interval=2)
    datevon=frame.index.get_level_values('tstamp').min().strftime('%d.%m.%y')
    datebis=frame.index.get_level_values('tstamp').max().strftime('%d.%m.%y')


    #### ------------------------------------ ####

    fig=plt.figure(figsize=(12,5))

    ax=fig.add_subplot(111)
    ax.margins(0,0)
    ax.set_facecolor(bg)
    ax.set_xlabel('Kernzeit / ceDis von '+datevon+' bis '+datebis)
    ax.xaxis.set_major_locator(sonntage)
    ax.xaxis.set_major_formatter(DateFormatter('%d.%m.%y'))
    ax.set_ylabel('Calls angenommen')


    ber=frame.xs('B', level=1, drop_level=True)['an']
    hgw=frame.xs('H', level=1, drop_level=True)['an']

    b=ax.bar(ber.index,ber.values, label='calls', color=nzb, width=1)
    h=ax.bar(hgw.index,hgw.values, label='calls', color=aht, width=0.6)

    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, horizontalalignment='right', size=6 )

    bercalls=int(ber.sum())
    hgwcalls=int(hgw.sum())
    allecalls=bercalls+hgwcalls
    anteilhgw=str(    format((hgwcalls/allecalls)*100,'.2f')   )  
    anteilber=str(    format((bercalls/allecalls)*100,'.2f')   )



    callsBH = mpatches.Patch(color='#000000', label='Calls Ber+Hgw: '+str(allecalls))
    callsB = mpatches.Patch(color=nzb, label='Calls Ber: '+str(bercalls) +' ('+anteilber+'%)'        )
    callsH = mpatches.Patch(color=aht, label='Calls Hgw: '+str(hgwcalls) +' ('+anteilhgw+'%)'        )

    ax.legend(handles=[callsBH,callsB,callsH],fontsize=8,ncol=2,loc='upper right',borderaxespad=-2,framealpha=0.99)

    plt.subplots_adjust(bottom=0.25)
plotzi(kzdata_byday)
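
The per-time-range iteration promised in the intro is not part of the dump; a minimal sketch, assuming monthly chunks of kzdata_byday and the plotzi function above (a month in which one site has no rows would need extra guarding, since plotzi takes an .xs() cross-section per site):

# iterate month-sized chunks over the whole dataset, saving one plot per month
for month, chunk in kzdata_byday.groupby(pd.Grouper(freq='MS', level='tstamp')):
    if chunk.empty:
        continue
    plotzi(chunk)
    plt.savefig('calls_kernzeit_{:%Y_%m}.png'.format(month))
    plt.close()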

Example plots:

  • agent data
  • hotline data
  • site-related distribution