import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df_orig = pd.read_excel('KDB_Financial_Report_2022.xlsx',sheet_name='raw_data2')
df = df_orig.copy()
df['Month'] = df['Posting Date'].apply(lambda x:x.month)
month_dict = {1: 'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug',
9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
df['Month Name'] = df['Month'].map(month_dict)
def exp_cat(exp):
if 'social media' in exp.lower():
return 'Social Media'
elif 'outreach' in exp.lower():
return 'Outreach'
elif 'events' in exp.lower():
return 'Events'
elif 'advertising' in exp.lower():
return 'Advertising'
else:
return 'Other'
df['Expense Category'] = df['Expense'].apply(exp_cat)
1. Ortho
2. Pedo
3. Mobile
4. DMMG
def office_cat(loc):
if 'dmmg' in loc.lower():
return 'DMMG'
elif 'ortho' in loc.lower():
return 'Ortho'
elif 'pedo' in loc.lower():
return 'Pedo'
elif 'mobile' in loc.lower():
return 'Mobile'
else:
return loc
df['Office Type'] = df['Location'].apply(office_cat)
df['Office Type'].value_counts()
Pedo 4711 Ortho 3942 Mobile 82 DMMG 39 Burlington 1 NC - K&A Greensboro 1 NC - K&A Randelman 1 NC - K&A Winston-Salem 1 NC - Thomasville 1 Name: Office Type, dtype: int64
Client confirmed these were all Pediatric offices (Pedo)
def office_cat_outliers(loc):
outliers = ['Burlington','NC - K&A Greensboro','NC - K&A Randelman','NC - K&A Winston-Salem','NC - Thomasville']
if loc in outliers:
return 'Pedo'
else:
return loc
df['Office Type'] = df['Office Type'].apply(office_cat_outliers)
df['Office Type'].value_counts()
Pedo 4716 Ortho 3942 Mobile 82 DMMG 39 Name: Office Type, dtype: int64
* outliers here are Burlington (NC), and DMMG (Corporate)
* the rest can be extracted as first two chars
list(df['Location'].unique())
['AZ - Country Club Pedo', 'AZ - Prince Pedo', 'AZ - Valencia Pedo', 'Burlington', 'NC - K&A Greensboro', 'NC - K&A Randelman', 'NC - K&A Winston-Salem', 'NC - Thomasville', 'NC - Triad Orthodontics', 'NC - Burlington Pedo', 'NC - Nicholas Pedo', 'NC - Randleman Pedo', 'NC - Thomasville Pedo', 'NC - Winston Salem Pedo', 'MS - Gulfport Pedo', 'TX - Mesa Pedo', 'MS - Meridian Pedo', 'AZ - Yuma Ortho', 'TX - Dyer Pedo', 'MS - Pascagola Pedo', 'NC - Nicholas Ortho', 'NC - Randleman Ortho', 'NC - Thomasville Ortho', 'NC - Winston Salem Ortho', 'TX - Zaragosa Pedo', 'MS - County Line Pedo', 'NC - Wilmington Pedo', 'MS - Woodrow Wilson Pedo', 'MS - Tupelo Ortho', 'MS - Tupelo Pedo', 'MS - Horn Lake Pedo', 'NC - Greenville Pedo', 'MS - Southaven Ortho', 'DMMG MARKETING ALLOCATED', 'AZ - Arcadia Ortho', 'AZ - Arcadia Pedo', 'TX - Midland Pedo', 'TX - Zaragosa Ortho', 'TX - Dyer Ortho', 'AZ - Greenway Pedo', 'AZ - Greenway Ortho', 'AZ - Tucson Mobile', 'TX - Mesa Ortho', 'DMMG - Accounting', 'DMMG - Community Relation', 'MS - Meridian Mobile', 'AZ - Prince Ortho', 'AZ - Country Club Ortho', 'MS - Meridian Ortho', 'MS - Coastal MS Mobile', 'AZ - Valencia Ortho', 'MS - CL/WW Mobile', 'AZ - Flagstaff Pedo', 'NC - Triad Mobile', 'TX - Sunland Ortho', 'MS - Ridgeland Ortho', 'KS - Olathe Pedo', 'DMMG - Staff Recruiting', 'AZ - Yuma Mobile', 'MS - Woodrow Wilson Ortho']
def get_state(loc):
if 'DMMG' in loc:
return 'DMMG'
elif loc == 'Burlington':
return 'NC'
else:
return loc[:2]
df['State'] = df['Location'].apply(get_state)
df['State'].value_counts()
NC 2528 AZ 2450 MS 2026 TX 1661 KS 75 DMMG 39 Name: State, dtype: int64
df.head()
Posting Date | Employee Last name | Employee First name | Line Amount | Location | Expense | Month | Month Name | Expense Category | Office Type | State | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2022-01-02 | Meda | Luz (Lilly) | 80.74 | AZ - Country Club Pedo | CommRel Social Media | 1 | Jan | Social Media | Pedo | AZ |
1 | 2022-01-02 | Meda | Luz (Lilly) | 80.74 | AZ - Prince Pedo | CommRel Social Media | 1 | Jan | Social Media | Pedo | AZ |
2 | 2022-01-02 | Meda | Luz (Lilly) | 80.74 | AZ - Valencia Pedo | CommRel Social Media | 1 | Jan | Social Media | Pedo | AZ |
3 | 2022-01-02 | Stottler | Jenna | 6.67 | Burlington | CommRel Outreach | 1 | Jan | Outreach | Pedo | NC |
4 | 2022-01-02 | Stottler | Jenna | 6.67 | NC - K&A Greensboro | CommRel Outreach | 1 | Jan | Outreach | Pedo | NC |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8779 entries, 0 to 8778 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Posting Date 8779 non-null datetime64[ns] 1 Employee Last name 8779 non-null object 2 Employee First name 8779 non-null object 3 Line Amount 8779 non-null float64 4 Location 8779 non-null object 5 Expense 8779 non-null object 6 Month 8779 non-null int64 7 Month Name 8779 non-null object 8 Expense Category 8779 non-null object 9 Office Type 8779 non-null object 10 State 8779 non-null object dtypes: datetime64[ns](1), float64(1), int64(1), object(8) memory usage: 754.6+ KB
1. Triad Ortho
2. Triad Pedo
3. Coast Pedo
4. Jackson Pedo
5. Jackson Ortho
6. El Paso Ortho
7. El Paso Pedo
8. Tuscon Ortho
9. Tuscon Pedo
10. DMMG
* The rest can be grouped by individual office name
triad_ortho = list(df[(df['State'] == 'NC') & (df['Office Type']=='Ortho')]['Location'].unique())
- doesn't include Wilmington or Greenville
triad_pedo = list(df[(df['State'] == 'NC') & (df['Office Type']=='Pedo')]['Location'].unique())
triad_pedo.pop(-1)
'NC - Greenville Pedo'
triad_pedo.pop(-1)
'NC - Wilmington Pedo'
coast = ['MS - Pascagola Pedo','MS - Gulfport Pedo']
jackson_pedo = ['MS - Woodrow Wilson Pedo','MS - County Line Pedo']
jackson_ortho = ['MS - Woodrow Wilson Ortho']
el_paso_ortho = ['TX - Sunland Ortho','TX - Dyer Ortho','TX - Zaragosa Ortho','TX - Mesa Ortho']
el_paso_pedo = ['TX - Dyer Pedo','TX - Mesa Pedo','TX - Zaragosa Pedo']
tucson_ortho = ['AZ - Prince Ortho','AZ - Country Club Ortho','AZ - Valencia Ortho']
tucson_pedo = ['AZ - Prince Pedo','AZ - Country Club Pedo','AZ - Valencia Pedo']
* and remove state inits + " - " from the rest
def add_groupname(loc):
if 'dmmg' in loc.lower():
return 'DMMG'
elif loc in triad_ortho:
return 'Triad Ortho'
elif loc in triad_pedo:
return 'Triad Pedo'
elif loc in coast:
return 'Coast Pedo'
elif loc in jackson_pedo:
return 'Jackson Pedo'
elif loc in jackson_ortho:
return 'Jackson Ortho'
elif loc in el_paso_pedo:
return 'El Paso Pedo'
elif loc in el_paso_ortho:
return 'El Paso Ortho'
elif loc in tucson_ortho:
return 'Tucson Ortho'
elif loc in tucson_pedo:
return 'Tucson Pedo'
else:
return loc[5:]
df['Group'] = df['Location'].apply(add_groupname)
df['Group'].value_counts()
Triad Pedo 1257 Tucson Ortho 1045 Triad Ortho 923 El Paso Ortho 814 El Paso Pedo 787 Tucson Pedo 713 Jackson Pedo 302 Horn Lake Pedo 258 Tupelo Pedo 239 Tupelo Ortho 232 Ridgeland Ortho 225 Southaven Ortho 213 Coast Pedo 200 Arcadia Pedo 194 Greenville Pedo 194 Greenway Pedo 168 Meridian Ortho 157 Wilmington Pedo 147 Arcadia Ortho 146 Greenway Ortho 117 Meridian Pedo 87 Olathe Pedo 75 Jackson Ortho 63 Midland Pedo 60 DMMG 39 Flagstaff Pedo 35 Meridian Mobile 22 Coastal MS Mobile 18 Tucson Mobile 17 CL/WW Mobile 10 Yuma Mobile 8 Triad Mobile 7 Yuma Ortho 7 Name: Group, dtype: int64
df.to_excel('KDB_Financial_Cleaned.xlsx',sheet_name='clean_data',index=False)