In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
df_orig = pd.read_excel('KDB_Financial_Report_2022.xlsx',sheet_name='raw_data2')
In [3]:
df = df_orig.copy()

Data Cleaning / Feature Engineering¶

Extract month from Posting Date¶

In [6]:
df['Month'] = df['Posting Date'].apply(lambda x:x.month)

Extract Month name¶

In [8]:
month_dict = {1: 'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug',
             9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
In [9]:
df['Month Name'] = df['Month'].map(month_dict)

Create Expense categories¶

In [12]:
def exp_cat(exp):
    if 'social media' in exp.lower():
        return 'Social Media'
    elif 'outreach' in exp.lower():
        return 'Outreach'
    elif 'events' in exp.lower():
        return 'Events'
    elif 'advertising' in exp.lower():
        return 'Advertising'
    else:
        return 'Other'
In [13]:
df['Expense Category'] = df['Expense'].apply(exp_cat)

Categorize office Type¶

1. Ortho
2. Pedo
3. Mobile
4. DMMG
In [20]:
def office_cat(loc):
    if 'dmmg' in loc.lower():
        return 'DMMG'
    elif 'ortho' in loc.lower():
        return 'Ortho'
    elif 'pedo' in loc.lower():
        return 'Pedo'
    elif 'mobile' in loc.lower():
        return 'Mobile'
    else:
        return loc
In [21]:
df['Office Type'] = df['Location'].apply(office_cat)
In [23]:
df['Office Type'].value_counts()
Out[23]:
Pedo                      4711
Ortho                     3942
Mobile                      82
DMMG                        39
Burlington                   1
NC - K&A Greensboro          1
NC - K&A Randelman           1
NC - K&A Winston-Salem       1
NC - Thomasville             1
Name: Office Type, dtype: int64

Clean up outliers (domain knowledge)¶

Client confirmed these were all Pediatric offices (Pedo)
In [24]:
def office_cat_outliers(loc):
    outliers = ['Burlington','NC - K&A Greensboro','NC - K&A Randelman','NC - K&A Winston-Salem','NC - Thomasville']
    if loc in outliers:
        return 'Pedo'
    else:
        return loc
In [25]:
df['Office Type'] = df['Office Type'].apply(office_cat_outliers)
In [26]:
df['Office Type'].value_counts()
Out[26]:
Pedo      4716
Ortho     3942
Mobile      82
DMMG        39
Name: Office Type, dtype: int64

Extract state into State column¶

* outliers here are Burlington (NC), and DMMG (Corporate)
* the rest can be extracted as first two chars
In [30]:
list(df['Location'].unique())
Out[30]:
['AZ - Country Club Pedo',
 'AZ - Prince Pedo',
 'AZ - Valencia Pedo',
 'Burlington',
 'NC - K&A Greensboro',
 'NC - K&A Randelman',
 'NC - K&A Winston-Salem',
 'NC - Thomasville',
 'NC - Triad Orthodontics',
 'NC - Burlington Pedo',
 'NC - Nicholas Pedo',
 'NC - Randleman Pedo',
 'NC - Thomasville Pedo',
 'NC - Winston Salem Pedo',
 'MS - Gulfport Pedo',
 'TX - Mesa Pedo',
 'MS - Meridian Pedo',
 'AZ - Yuma Ortho',
 'TX - Dyer Pedo',
 'MS - Pascagola Pedo',
 'NC - Nicholas Ortho',
 'NC - Randleman Ortho',
 'NC - Thomasville Ortho',
 'NC - Winston Salem Ortho',
 'TX - Zaragosa Pedo',
 'MS - County Line Pedo',
 'NC - Wilmington Pedo',
 'MS - Woodrow Wilson Pedo',
 'MS - Tupelo Ortho',
 'MS - Tupelo Pedo',
 'MS - Horn Lake Pedo',
 'NC - Greenville Pedo',
 'MS - Southaven Ortho',
 'DMMG MARKETING ALLOCATED',
 'AZ - Arcadia Ortho',
 'AZ - Arcadia Pedo',
 'TX - Midland Pedo',
 'TX - Zaragosa Ortho',
 'TX - Dyer Ortho',
 'AZ - Greenway Pedo',
 'AZ - Greenway Ortho',
 'AZ - Tucson Mobile',
 'TX - Mesa Ortho',
 'DMMG - Accounting',
 'DMMG - Community Relation',
 'MS - Meridian Mobile',
 'AZ - Prince Ortho',
 'AZ - Country Club Ortho',
 'MS - Meridian Ortho',
 'MS - Coastal MS Mobile',
 'AZ - Valencia Ortho',
 'MS - CL/WW Mobile',
 'AZ - Flagstaff Pedo',
 'NC - Triad Mobile',
 'TX - Sunland Ortho',
 'MS - Ridgeland Ortho',
 'KS - Olathe Pedo',
 'DMMG - Staff Recruiting',
 'AZ - Yuma Mobile',
 'MS - Woodrow Wilson Ortho']
In [31]:
def get_state(loc):
    if 'DMMG' in loc:
        return 'DMMG'
    elif loc == 'Burlington':
        return 'NC'
    else:
        return loc[:2]
In [32]:
df['State'] = df['Location'].apply(get_state)
In [33]:
df['State'].value_counts()
Out[33]:
NC      2528
AZ      2450
MS      2026
TX      1661
KS        75
DMMG      39
Name: State, dtype: int64
In [34]:
df.head()
Out[34]:
Posting Date Employee Last name Employee First name Line Amount Location Expense Month Month Name Expense Category Office Type State
0 2022-01-02 Meda Luz (Lilly) 80.74 AZ - Country Club Pedo CommRel Social Media 1 Jan Social Media Pedo AZ
1 2022-01-02 Meda Luz (Lilly) 80.74 AZ - Prince Pedo CommRel Social Media 1 Jan Social Media Pedo AZ
2 2022-01-02 Meda Luz (Lilly) 80.74 AZ - Valencia Pedo CommRel Social Media 1 Jan Social Media Pedo AZ
3 2022-01-02 Stottler Jenna 6.67 Burlington CommRel Outreach 1 Jan Outreach Pedo NC
4 2022-01-02 Stottler Jenna 6.67 NC - K&A Greensboro CommRel Outreach 1 Jan Outreach Pedo NC
In [37]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8779 entries, 0 to 8778
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Posting Date         8779 non-null   datetime64[ns]
 1   Employee Last name   8779 non-null   object        
 2   Employee First name  8779 non-null   object        
 3   Line Amount          8779 non-null   float64       
 4   Location             8779 non-null   object        
 5   Expense              8779 non-null   object        
 6   Month                8779 non-null   int64         
 7   Month Name           8779 non-null   object        
 8   Expense Category     8779 non-null   object        
 9   Office Type          8779 non-null   object        
 10  State                8779 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(8)
memory usage: 754.6+ KB

Add client specified territory labels to group some offices together¶

1. Triad Ortho
2. Triad Pedo
3. Coast Pedo
4. Jackson Pedo
5. Jackson Ortho
6. El Paso Ortho
7. El Paso Pedo
8. Tuscon Ortho
9. Tuscon Pedo
10. DMMG
* The rest can be grouped by individual office name

Triad Ortho¶

In [43]:
triad_ortho = list(df[(df['State'] == 'NC') & (df['Office Type']=='Ortho')]['Location'].unique())

Triad Pedo¶

- doesn't include Wilmington or Greenville
In [46]:
triad_pedo = list(df[(df['State'] == 'NC') & (df['Office Type']=='Pedo')]['Location'].unique())
In [48]:
triad_pedo.pop(-1)
Out[48]:
'NC - Greenville Pedo'
In [51]:
triad_pedo.pop(-1)
Out[51]:
'NC - Wilmington Pedo'

Coast and Jackson (MS)¶

In [60]:
coast = ['MS - Pascagola Pedo','MS - Gulfport Pedo']
In [61]:
jackson_pedo = ['MS - Woodrow Wilson Pedo','MS - County Line Pedo']
jackson_ortho = ['MS - Woodrow Wilson Ortho']

El Paso¶

In [64]:
el_paso_ortho = ['TX - Sunland Ortho','TX - Dyer Ortho','TX - Zaragosa Ortho','TX - Mesa Ortho']
el_paso_pedo = ['TX - Dyer Pedo','TX - Mesa Pedo','TX - Zaragosa Pedo']

Tucson¶

In [67]:
tucson_ortho = ['AZ - Prince Ortho','AZ - Country Club Ortho','AZ - Valencia Ortho']
tucson_pedo = ['AZ - Prince Pedo','AZ - Country Club Pedo','AZ - Valencia Pedo']

Apply group/territory names as new column:¶

* and remove state inits + " - " from the rest
In [73]:
def add_groupname(loc):
    if 'dmmg' in loc.lower():
        return 'DMMG'
    
    elif loc in triad_ortho:
        return 'Triad Ortho'
    elif loc in triad_pedo:
        return 'Triad Pedo'
    elif loc in coast:
        return 'Coast Pedo'
    elif loc in jackson_pedo:
        return 'Jackson Pedo'
    elif loc in jackson_ortho:
        return 'Jackson Ortho'
    elif loc in el_paso_pedo:
        return 'El Paso Pedo'
    elif loc in el_paso_ortho:
        return 'El Paso Ortho'
    elif loc in tucson_ortho:
        return 'Tucson Ortho'
    elif loc in tucson_pedo:
        return 'Tucson Pedo'
    else:
        return loc[5:]
In [74]:
df['Group'] = df['Location'].apply(add_groupname)
In [75]:
df['Group'].value_counts()
Out[75]:
Triad Pedo           1257
Tucson Ortho         1045
Triad Ortho           923
El Paso Ortho         814
El Paso Pedo          787
Tucson Pedo           713
Jackson Pedo          302
Horn Lake Pedo        258
Tupelo Pedo           239
Tupelo Ortho          232
Ridgeland Ortho       225
Southaven Ortho       213
Coast Pedo            200
Arcadia Pedo          194
Greenville Pedo       194
Greenway Pedo         168
Meridian Ortho        157
Wilmington Pedo       147
Arcadia Ortho         146
Greenway Ortho        117
Meridian Pedo          87
Olathe Pedo            75
Jackson Ortho          63
Midland Pedo           60
DMMG                   39
Flagstaff Pedo         35
Meridian Mobile        22
Coastal MS Mobile      18
Tucson Mobile          17
CL/WW Mobile           10
Yuma Mobile             8
Triad Mobile            7
Yuma Ortho              7
Name: Group, dtype: int64

Export to Excel file for Viz in Power BI¶

In [78]:
df.to_excel('KDB_Financial_Cleaned.xlsx',sheet_name='clean_data',index=False)
In [ ]: