import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sqlalchemy

print(sqlalchemy.__version__)

2.0.48

from sqlalchemy import create_engine, inspect
from sqlalchemy import text
from sqlalchemy.schema import CreateSchema
from sqlalchemy import BigInteger, Column, Integer, String, Float, Boolean
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

engine = create_engine('postgresql+psycopg2://postgres@localhost/train_reward_compare')

with engine.connect() as conn:
    result = conn.execute(text("select count(*) from onet.occupation_data"))
    print(result.fetchone())

(1016,)

file_nat = '../../raw-data/bls-wage/oesm24nat/national_M2024_dl.xlsx'
file_sta = '../../raw-data/bls-wage/oesm24st/state_M2024_dl.xlsx'
file_met = '../../raw-data/bls-wage/oesm24ma/MSA_M2024_dl.xlsx'
file_bos = '../../raw-data/bls-wage/oesm24ma/BOS_M2024_dl.xlsx'
file_nbs = '../../raw-data/bls-wage/oesm24in4/natsector_M2024_dl.xlsx'

nat = pd.ExcelFile(file_nat)
sta = pd.ExcelFile(file_sta)
met = pd.ExcelFile(file_met)
bos = pd.ExcelFile(file_bos)
nbs = pd.ExcelFile(file_nbs)

print(nat.sheet_names)
print(sta.sheet_names)
print(met.sheet_names)
print(bos.sheet_names)
print(nbs.sheet_names)

['national_M2024_dl', 'Field Descriptions', 'UpdateTime', 'Filler']
['state_M2024_dl', 'Field Descriptions', 'UpdateTime', 'Filler']
['MSA_M2024_dl', 'Field Descriptions', 'UpdateTime', 'Filler']
['BOS_M2024_dl', 'Field Descriptions', 'UpdateTime', 'Filler']
['Natsector_M2024_dl', 'Field Descriptions', 'UpdateTime', 'Filler']

nat.parse('national_M2024_dl').head()

nat_data = nat.parse('national_M2024_dl')

nat_data.columns = nat_data.columns.str.lower()

nat_data.head()

nat_desc_pre = nat.parse('Field Descriptions')

nat_desc_pre.iloc[:15,:]

nat_desc_pre = nat_desc_pre.drop('Unnamed: 2', axis=1)

drop_index = nat_desc_pre.index[0:8]

nat_desc_pre_dropped = nat_desc_pre.drop(drop_index)

nat_desc_pre_dropped.head()

nat_desc_pre_new_header = nat_desc_pre_dropped.iloc[0]

nat_desc_pre_new = nat_desc_pre_dropped[1:]

nat_desc_pre_new.columns = nat_desc_pre_new_header

nat_desc_pre_new = nat_desc_pre_new.reset_index(drop=True)

nat_desc_pre_new.index.name = None

nat_desc_pre_new = nat_desc_pre_new.rename_axis(None, axis=1)

nat_desc_pre_new.head()

nat_desc_pre_new.tail(10)

notes_df = nat_desc_pre_new.iloc[32:,:]

nat_desc = nat_desc_pre_new.iloc[:32, :]

nat_desc.loc[:,'Field'].str.startswith(' ')

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
Name: Field, dtype: bool

nat_desc['Field'].str.endswith(' ')

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30     True
31     True
Name: Field, dtype: bool

nat_desc.loc[:,'Field'] = nat_desc.loc[:,'Field'].str.strip()

nat_desc.loc[:,'Field'].str.startswith(' ')

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
Name: Field, dtype: bool

nat_desc['Field'].str.endswith(' ')

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
Name: Field, dtype: bool

nat_desc

notes_df = notes_df.drop(columns=['Field Description'], axis=1)

notes_df = notes_df.iloc[2:,:]

notes_df = notes_df.rename(columns={'Field':'Notes'})

notes_df

sta.parse('state_M2024_dl').head()

sta_data = sta.parse('state_M2024_dl')

sta_data.columns = sta_data.columns.str.lower()

sta_data.head()

sta_desc = sta.parse('Field Descriptions')

sta_desc.iloc[:15,:]

sta_desc = sta_desc.drop('Unnamed: 2', axis=1)
drop_index = sta_desc.index[0:8]
sta_desc_dropped = sta_desc.drop(drop_index)
sta_desc_new_header = sta_desc_dropped.iloc[0]
sta_desc_new = sta_desc_dropped[1:]
sta_desc_new.columns = sta_desc_new_header
sta_desc_new = sta_desc_new.reset_index(drop=True)
sta_desc_new.index.name = None
sta_desc_new = sta_desc_new.rename_axis(None, axis=1)

sta_desc_new.head()

met_data = met.parse('MSA_M2024_dl')
met_data.columns = met_data.columns.str.lower()

met_data.head()

met_desc = met.parse('Field Descriptions')

met_desc.iloc[:15,:]

met_desc = met_desc.drop('Unnamed: 2', axis=1)
drop_index = met_desc.index[0:8]
met_desc_dropped = met_desc.drop(drop_index)
met_desc_new_header = met_desc_dropped.iloc[0]
met_desc_new = met_desc_dropped[1:]
met_desc_new.columns = met_desc_new_header
met_desc_new = met_desc_new.reset_index(drop=True)
met_desc_new.index.name = None
met_desc_new = met_desc_new.rename_axis(None, axis=1)

met_desc_new.head()

bos_data = bos.parse('BOS_M2024_dl')
bos_data.columns = bos_data.columns.str.lower()

bos_data.head()

bos_desc = bos.parse('Field Descriptions')

bos_desc.iloc[:15,:]

bos_desc = bos_desc.drop('Unnamed: 2', axis=1)
drop_index = bos_desc.index[0:8]
bos_desc_dropped = bos_desc.drop(drop_index)
bos_desc_new_header = bos_desc_dropped.iloc[0]
bos_desc_new = bos_desc_dropped[1:]
bos_desc_new.columns = bos_desc_new_header
bos_desc_new = bos_desc_new.reset_index(drop=True)
bos_desc_new.index.name = None
bos_desc_new = bos_desc_new.rename_axis(None, axis=1)

bos_desc_new.head()

nbs_data = nbs.parse('Natsector_M2024_dl')
nbs_data.columns = nbs_data.columns.str.lower()

nbs_data.head()

nbs_desc = nbs.parse('Field Descriptions')

nbs_desc.iloc[:15,:]

nbs_desc = nbs_desc.drop('Unnamed: 2', axis=1)
drop_index = nbs_desc.index[0:8]
nbs_desc_dropped = nbs_desc.drop(drop_index)
nbs_desc_new_header = nbs_desc_dropped.iloc[0]
nbs_desc_new = nbs_desc_dropped[1:]
nbs_desc_new.columns = nbs_desc_new_header
nbs_desc_new = nbs_desc_new.reset_index(drop=True)
nbs_desc_new.index.name = None
nbs_desc_new = nbs_desc_new.rename_axis(None, axis=1)

nbs_desc_new.head()

nat_data.dtypes

area              int64
area_title       object
area_type         int64
prim_state       object
naics             int64
naics_title      object
i_group          object
own_code          int64
occ_code         object
occ_title        object
o_group          object
tot_emp           int64
emp_prse        float64
jobs_1000       float64
loc_quotient    float64
pct_total       float64
pct_rpt         float64
h_mean           object
a_mean           object
mean_prse       float64
h_pct10          object
h_pct25          object
h_median         object
h_pct75          object
h_pct90          object
a_pct10          object
a_pct25          object
a_median         object
a_pct75          object
a_pct90          object
annual           object
hourly           object
dtype: object

nat_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403 entries, 0 to 1402
Data columns (total 32 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area          1403 non-null   int64  
 1   area_title    1403 non-null   object 
 2   area_type     1403 non-null   int64  
 3   prim_state    1403 non-null   object 
 4   naics         1403 non-null   int64  
 5   naics_title   1403 non-null   object 
 6   i_group       1403 non-null   object 
 7   own_code      1403 non-null   int64  
 8   occ_code      1403 non-null   object 
 9   occ_title     1403 non-null   object 
 10  o_group       1403 non-null   object 
 11  tot_emp       1403 non-null   int64  
 12  emp_prse      1403 non-null   float64
 13  jobs_1000     0 non-null      float64
 14  loc_quotient  0 non-null      float64
 15  pct_total     0 non-null      float64
 16  pct_rpt       0 non-null      float64
 17  h_mean        1403 non-null   object 
 18  a_mean        1403 non-null   object 
 19  mean_prse     1403 non-null   float64
 20  h_pct10       1403 non-null   object 
 21  h_pct25       1403 non-null   object 
 22  h_median      1403 non-null   object 
 23  h_pct75       1403 non-null   object 
 24  h_pct90       1403 non-null   object 
 25  a_pct10       1403 non-null   object 
 26  a_pct25       1403 non-null   object 
 27  a_median      1403 non-null   object 
 28  a_pct75       1403 non-null   object 
 29  a_pct90       1403 non-null   object 
 30  annual        83 non-null     object 
 31  hourly        7 non-null      object 
dtypes: float64(6), int64(5), object(21)
memory usage: 350.9+ KB

with pd.option_context('display.max_colwidth', None):
    print(nat_desc.iloc[13:17,:])

           Field  \
13     jobs_1000   
14  loc quotient   
15     pct_total   
16       pct_rpt   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      Field Description  
13                                                                                                                                                                                                                                                                                                                          The number of jobs (employment) in the given occupation per 1,000 jobs in the given area. Only available for the state and MSA estimates; otherwise, this column is blank.   
14  The location quotient represents the ratio of an occupation’s share of employment in a given area to that occupation’s share of employment in the U.S. as a whole. For example, an occupation that makes up 10 percent of employment in a specific metropolitan area compared with 2 percent of U.S. employment would have a location quotient of 5 for the area in question. Only available for the state, metropolitan area, and nonmetropolitan area estimates; otherwise, this column is blank.  
15                                                                                                                                                                                                                                Percent of industry employment in the given occupation. Percents may not sum to 100 because the totals may include data for occupations that could not be published separately. Only available for the national industry estimates; otherwise, this column is blank.   
16                                                                                                                                                                                                                                                                                                                                         Percent of establishments reporting the given occupation for the cell. Only available for the national industry estimates; otherwise, this column is blank.

drop_nat_data_list = ['jobs_1000', 'loc_quotient', 'pct_total', 'pct_rpt']

nat_data = nat_data.drop(drop_nat_data_list, axis=1)

nat_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403 entries, 0 to 1402
Data columns (total 28 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   area         1403 non-null   int64  
 1   area_title   1403 non-null   object 
 2   area_type    1403 non-null   int64  
 3   prim_state   1403 non-null   object 
 4   naics        1403 non-null   int64  
 5   naics_title  1403 non-null   object 
 6   i_group      1403 non-null   object 
 7   own_code     1403 non-null   int64  
 8   occ_code     1403 non-null   object 
 9   occ_title    1403 non-null   object 
 10  o_group      1403 non-null   object 
 11  tot_emp      1403 non-null   int64  
 12  emp_prse     1403 non-null   float64
 13  h_mean       1403 non-null   object 
 14  a_mean       1403 non-null   object 
 15  mean_prse    1403 non-null   float64
 16  h_pct10      1403 non-null   object 
 17  h_pct25      1403 non-null   object 
 18  h_median     1403 non-null   object 
 19  h_pct75      1403 non-null   object 
 20  h_pct90      1403 non-null   object 
 21  a_pct10      1403 non-null   object 
 22  a_pct25      1403 non-null   object 
 23  a_median     1403 non-null   object 
 24  a_pct75      1403 non-null   object 
 25  a_pct90      1403 non-null   object 
 26  annual       83 non-null     object 
 27  hourly       7 non-null      object 
dtypes: float64(2), int64(5), object(21)
memory usage: 307.0+ KB

nat_desc = nat_desc[nat_desc['Field'].isin(drop_nat_data_list) == False]

nat_desc

nat_data['area'].describe()

count    1403.0
mean       99.0
std         0.0
min        99.0
25%        99.0
50%        99.0
75%        99.0
max        99.0
Name: area, dtype: float64

print(nat_desc.loc[nat_desc['Field'] == 'area', 'Field Description'].values[0])

U.S. (99), state FIPS code, Metropolitan Statistical Area (MSA) code, or OEWS-specific nonmetropolitan area code

nat_data['area_title'].head()

0    U.S.
1    U.S.
2    U.S.
3    U.S.
4    U.S.
Name: area_title, dtype: object

nat_data['area_title'].unique()

array(['U.S.'], dtype=object)

nat_data['area_type'].unique()

array([1])

nat_data['prim_state'].unique()

array(['US'], dtype=object)

nat_data['naics'].unique()

array([0])

print(nat_desc.loc[nat_desc['Field'] == 'naics', 'Field Description'].values[0])

North American Industry Classification System (NAICS) code for the given industry

nat_data['naics_title'].unique()

array(['Cross-industry'], dtype=object)

print(nat_desc.loc[nat_desc['Field'] == 'naics_title', 'Field Description'].values[0])

North American Industry Classification System (NAICS) title for the given industry

nat_data['i_group'].unique()

array(['cross-industry'], dtype=object)

print(nat_desc.loc[nat_desc['Field'] == 'i_group', 'Field Description'].values[0])

Industry level. Indicates cross-industry or NAICS sector, 3-digit, 4-digit, 5-digit, or 6-digit industry. For industries that OEWS no longer publishes at the 4-digit NAICS level, the “4-digit” designation indicates the most detailed industry breakdown available: either a standard NAICS 3-digit industry or an OEWS-specific combination of 4-digit industries. Industries that OEWS has aggregated to the 3-digit NAICS level (for example, NAICS 327000) will appear twice, once with the “3-digit” and once with the “4-digit” designation.

nat_data['own_code'].unique()

array([1235])

print(nat_desc.loc[nat_desc['Field'] == 'own_code', 'Field Description'].values[0])

Ownership type: 1= Federal Government; 2= State Government; 3= Local Government; 123= Federal, State, and Local Government; 235=Private, State, and Local Government; 35 = Private and Local Government; 5= Private; 57=Private, Local Government Gambling Establishments (Sector 71), and Local Government Casino Hotels (Sector 72); 58= Private plus State and Local Government Hospitals; 59= Private and Postal Service; 1235= Federal, State, and Local Government and Private Sector

nat_data['occ_code'].unique()

array(['00-0000', '11-0000', '11-1000', ..., '53-7121', '53-7190',
       '53-7199'], dtype=object)

print(nat_desc.loc[nat_desc['Field'] == 'occ_code', 'Field Description'].values[0])

The 6-digit Standard Occupational Classification (SOC) code or OEWS-specific code for the occupation

nat_data['occ_code'].describe()

count        1403
unique       1396
top       31-1120
freq            2
Name: occ_code, dtype: object

nat_data['occ_code'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1403 entries, 0 to 1402
Series name: occ_code
Non-Null Count  Dtype 
--------------  ----- 
1403 non-null   object
dtypes: object(1)
memory usage: 11.1+ KB

print(type(nat_data['occ_code'][0]))

<class 'str'>

counts = nat_data['occ_code'].value_counts()

counts

occ_code
31-1120    2
47-4090    2
29-2010    2
51-2090    2
39-7010    2
          ..
27-2012    1
27-2011    1
27-2010    1
27-2000    1
53-7199    1
Name: count, Length: 1396, dtype: int64

threshold = 1

counts_thresh = counts[counts > threshold].index

counts_thresh

Index(['31-1120', '47-4090', '29-2010', '51-2090', '39-7010', '13-1020',
       '13-2020'],
      dtype='object', name='occ_code')

nat_data[nat_data.groupby('occ_code')['occ_code'].transform('count') > threshold]

nat_data['o_group'].value_counts()

o_group
detailed    831
broad       455
minor        94
major        22
total         1
Name: count, dtype: int64

nat_data = nat_data.drop_duplicates()

counts = nat_data['occ_code'].value_counts()

counts

occ_code
31-1120    2
47-4090    2
29-2010    2
51-2090    2
39-7010    2
          ..
27-2012    1
27-2011    1
27-2010    1
27-2000    1
53-7199    1
Name: count, Length: 1396, dtype: int64

with pd.option_context('display.max_columns', None):
    print(nat_data[nat_data.groupby('occ_code')['occ_code'].transform('count') > threshold])

      area area_title  area_type prim_state  naics     naics_title  \
78      99       U.S.          1         US      0  Cross-industry   
79      99       U.S.          1         US      0  Cross-industry   
111     99       U.S.          1         US      0  Cross-industry   
112     99       U.S.          1         US      0  Cross-industry   
573     99       U.S.          1         US      0  Cross-industry   
574     99       U.S.          1         US      0  Cross-industry   
612     99       U.S.          1         US      0  Cross-industry   
613     99       U.S.          1         US      0  Cross-industry   
779     99       U.S.          1         US      0  Cross-industry   
780     99       U.S.          1         US      0  Cross-industry   
1045    99       U.S.          1         US      0  Cross-industry   
1046    99       U.S.          1         US      0  Cross-industry   
1163    99       U.S.          1         US      0  Cross-industry   
1164    99       U.S.          1         US      0  Cross-industry   

             i_group  own_code occ_code  \
78    cross-industry      1235  13-1020   
79    cross-industry      1235  13-1020   
111   cross-industry      1235  13-2020   
112   cross-industry      1235  13-2020   
573   cross-industry      1235  29-2010   
574   cross-industry      1235  29-2010   
612   cross-industry      1235  31-1120   
613   cross-industry      1235  31-1120   
779   cross-industry      1235  39-7010   
780   cross-industry      1235  39-7010   
1045  cross-industry      1235  47-4090   
1046  cross-industry      1235  47-4090   
1163  cross-industry      1235  51-2090   
1164  cross-industry      1235  51-2090   

                                              occ_title   o_group  tot_emp  \
78                         Buyers and Purchasing Agents     broad   486900   
79                         Buyers and Purchasing Agents  detailed   486900   
111                   Property Appraisers and Assessors     broad    59070   
112                   Property Appraisers and Assessors  detailed    59070   
573   Clinical Laboratory Technologists and Technicians     broad   343040   
574   Clinical Laboratory Technologists and Technicians  detailed   343040   
612                 Home Health and Personal Care Aides     broad  3988140   
613                 Home Health and Personal Care Aides  detailed  3988140   
779                              Tour and Travel Guides     broad    49010   
780                              Tour and Travel Guides  detailed    49010   
1045     Miscellaneous Construction and Related Workers     broad    33530   
1046     Miscellaneous Construction and Related Workers  detailed    33530   
1163           Miscellaneous Assemblers and Fabricators     broad  1457800   
1164           Miscellaneous Assemblers and Fabricators  detailed  1457800   

      emp_prse h_mean a_mean  mean_prse h_pct10 h_pct25 h_median h_pct75  \
78         0.6  39.29  81720        0.4   22.34   28.21    36.37   47.69   
79         0.6  39.29  81720        0.4   22.34   28.21    36.37   47.69   
111        2.7   37.1  77160        2.5    18.5   23.71    31.45   43.66   
112        2.7   37.1  77160        2.5    18.5   23.71    31.45   43.66   
573        1.0  31.41  65320        0.3   18.28   22.39    29.75   38.46   
574        1.0  31.41  65320        0.3   18.28   22.39    29.75   38.46   
612        0.3  16.82  34990        0.3   12.31    14.6    16.78   18.26   
613        0.3  16.82  34990        0.3   12.31    14.6    16.78   18.26   
779        2.7  20.72  43090        3.3   12.93   15.03    17.63   22.07   
780        2.7  20.72  43090        3.3   12.93   15.03    17.63   22.07   
1045       4.8  25.44  52910        1.9    16.9   19.23    23.14   29.31   
1046       4.8  25.44  52910        1.9    16.9   19.23    23.14   29.31   
1163       0.5  21.57  44860        0.2   15.22   17.62    20.29   23.76   
1164       0.5  21.57  44860        0.2   15.22   17.62    20.29   23.76   

     h_pct90 a_pct10 a_pct25 a_median a_pct75 a_pct90 annual hourly  
78     61.31   46460   58670    75650   99190  127520    NaN    NaN  
79     61.31   46460   58670    75650   99190  127520    NaN    NaN  
111    59.02   38480   49310    65420   90810  122760    NaN    NaN  
112    59.02   38480   49310    65420   90810  122760    NaN    NaN  
573    47.11   38020   46580    61890   80010   97990    NaN    NaN  
574    47.11   38020   46580    61890   80010   97990    NaN    NaN  
612    21.25   25600   30370    34900   37980   44190    NaN    NaN  
613    21.25   25600   30370    34900   37980   44190    NaN    NaN  
779    28.81   26890   31250    36660   45910   59930    NaN    NaN  
780    28.81   26890   31250    36660   45910   59930    NaN    NaN  
1045   37.28   35160   39990    48120   60960   77540    NaN    NaN  
1046   37.28   35160   39990    48120   60960   77540    NaN    NaN  
1163   29.82   31650   36660    42210   49410   62030    NaN    NaN  
1164   29.82   31650   36660    42210   49410   62030    NaN    NaN

print(nat_desc.loc[nat_desc['Field'] == 'o_group', 'Field Description'].values[0])

SOC occupation level. For most occupations, this field indicates the standard SOC major, minor, broad, and detailed levels, in addition to all-occupations totals. For occupations that OEWS no longer publishes at the SOC detailed level, the “detailed” designation indicates the most detailed data available: either a standard SOC broad occupation or an OEWS-specific combination of detailed occupations. Occupations that OEWS has aggregated to the SOC broad occupation level will appear in the file twice, once with the “broad” and once with the “detailed” designation.

nat_data_filtered = nat_data[(nat_data['occ_code'].isin(counts_thresh)) & (nat_data['o_group'] == 'detailed')]

nat_data_filtered

nat_data_filtered['o_group']

79      detailed
112     detailed
574     detailed
613     detailed
780     detailed
1046    detailed
1164    detailed
Name: o_group, dtype: object

nat_data_pre_drop = nat_data.drop(nat_data_filtered.index)

nat_data_pre_drop.head()

nat_data_pre_drop.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1396 entries, 0 to 1402
Data columns (total 28 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   area         1396 non-null   int64  
 1   area_title   1396 non-null   object 
 2   area_type    1396 non-null   int64  
 3   prim_state   1396 non-null   object 
 4   naics        1396 non-null   int64  
 5   naics_title  1396 non-null   object 
 6   i_group      1396 non-null   object 
 7   own_code     1396 non-null   int64  
 8   occ_code     1396 non-null   object 
 9   occ_title    1396 non-null   object 
 10  o_group      1396 non-null   object 
 11  tot_emp      1396 non-null   int64  
 12  emp_prse     1396 non-null   float64
 13  h_mean       1396 non-null   object 
 14  a_mean       1396 non-null   object 
 15  mean_prse    1396 non-null   float64
 16  h_pct10      1396 non-null   object 
 17  h_pct25      1396 non-null   object 
 18  h_median     1396 non-null   object 
 19  h_pct75      1396 non-null   object 
 20  h_pct90      1396 non-null   object 
 21  a_pct10      1396 non-null   object 
 22  a_pct25      1396 non-null   object 
 23  a_median     1396 non-null   object 
 24  a_pct75      1396 non-null   object 
 25  a_pct90      1396 non-null   object 
 26  annual       83 non-null     object 
 27  hourly       7 non-null      object 
dtypes: float64(2), int64(5), object(21)
memory usage: 316.3+ KB

nat_data_pre_drop[nat_data_pre_drop.groupby('occ_code')['occ_code'].transform('count') > threshold]

nat_data = nat_data_pre_drop

nat_data['occ_title'].unique()

array(['All Occupations', 'Management Occupations', 'Top Executives', ...,
       'Tank Car, Truck, and Ship Loaders',
       'Miscellaneous Material Moving Workers',
       'Material Moving Workers, All Other'], dtype=object)

print(nat_desc.loc[nat_desc['Field'] == 'occ_title', 'Field Description'].values[0])

SOC title or OEWS-specific title for the occupation

nat_data['occ_title'].describe()

count                   1396
unique                  1138
top       Massage Therapists
freq                       2
Name: occ_title, dtype: object

nat_data['occ_title'].info()

<class 'pandas.core.series.Series'>
Index: 1396 entries, 0 to 1402
Series name: occ_title
Non-Null Count  Dtype 
--------------  ----- 
1396 non-null   object
dtypes: object(1)
memory usage: 21.8+ KB

print(type(nat_data['occ_title'][0]))

<class 'str'>

counts = nat_data['occ_title'].value_counts()

counts

occ_title
Massage Therapists                                 2
Biological Technicians                             2
Couriers and Messengers                            2
Cargo and Freight Agents                           2
Printing Workers                                   2
                                                  ..
Healthcare Diagnosing or Treating Practitioners    1
Dentists                                           1
Dentists, General                                  1
Oral and Maxillofacial Surgeons                    1
Material Moving Workers, All Other                 1
Name: count, Length: 1138, dtype: int64

nat_data[nat_data.groupby('occ_title')['occ_title'].transform('count') > threshold]

with pd.option_context('display.max_columns', None):
    print(nat_data[nat_data.groupby('occ_title')['occ_title'].transform('count') > threshold].head(2))

   area area_title  area_type prim_state  naics     naics_title  \
3    99       U.S.          1         US      0  Cross-industry   
4    99       U.S.          1         US      0  Cross-industry   

          i_group  own_code occ_code         occ_title   o_group  tot_emp  \
3  cross-industry      1235  11-1010  Chief Executives     broad   211850   
4  cross-industry      1235  11-1011  Chief Executives  detailed   211850   

   emp_prse  h_mean  a_mean  mean_prse h_pct10 h_pct25 h_median h_pct75  \
3       1.2  126.41  262930        0.9   35.44   60.61    99.24       #   
4       1.2  126.41  262930        0.9   35.44   60.61    99.24       #   

  h_pct90 a_pct10 a_pct25 a_median a_pct75 a_pct90 annual hourly  
3       #   73710  126080   206420       #       #    NaN    NaN  
4       #   73710  126080   206420       #       #    NaN    NaN

counts_thresh = counts[counts > threshold].index

nat_data_filtered = nat_data[(nat_data['occ_title'].isin(counts_thresh)) & (nat_data['o_group'] == 'detailed')]

nat_data_filtered

nat_data_filtered['o_group'].unique()

array(['detailed'], dtype=object)

nat_data_pre_drop = nat_data.drop(nat_data_filtered.index)

nat_data_pre_drop.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1147 entries, 0 to 1402
Data columns (total 28 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   area         1147 non-null   int64  
 1   area_title   1147 non-null   object 
 2   area_type    1147 non-null   int64  
 3   prim_state   1147 non-null   object 
 4   naics        1147 non-null   int64  
 5   naics_title  1147 non-null   object 
 6   i_group      1147 non-null   object 
 7   own_code     1147 non-null   int64  
 8   occ_code     1147 non-null   object 
 9   occ_title    1147 non-null   object 
 10  o_group      1147 non-null   object 
 11  tot_emp      1147 non-null   int64  
 12  emp_prse     1147 non-null   float64
 13  h_mean       1147 non-null   object 
 14  a_mean       1147 non-null   object 
 15  mean_prse    1147 non-null   float64
 16  h_pct10      1147 non-null   object 
 17  h_pct25      1147 non-null   object 
 18  h_median     1147 non-null   object 
 19  h_pct75      1147 non-null   object 
 20  h_pct90      1147 non-null   object 
 21  a_pct10      1147 non-null   object 
 22  a_pct25      1147 non-null   object 
 23  a_median     1147 non-null   object 
 24  a_pct75      1147 non-null   object 
 25  a_pct90      1147 non-null   object 
 26  annual       80 non-null     object 
 27  hourly       7 non-null      object 
dtypes: float64(2), int64(5), object(21)
memory usage: 259.9+ KB

counts = nat_data_pre_drop['occ_title'].value_counts()

counts

occ_title
Supervisors of Food Preparation and Serving Workers    2
Baggage Porters, Bellhops, and Concierges              2
Sales Representatives, Wholesale and Manufacturing     2
Grounds Maintenance Workers                            2
Tour and Travel Guides                                 2
                                                      ..
Athletes, Coaches, Umpires, and Related Workers        1
Athletes and Sports Competitors                        1
Coaches and Scouts                                     1
Umpires, Referees, and Other Sports Officials          1
Material Moving Workers, All Other                     1
Name: count, Length: 1138, dtype: int64

nat_data_pre_drop[nat_data_pre_drop.groupby('occ_title')['occ_title'].transform('count') > threshold]

with pd.option_context('display.max_columns', None):
    print(nat_data_pre_drop[nat_data_pre_drop.groupby('occ_title')['occ_title'].transform('count') > threshold])

      area area_title  area_type prim_state  naics     naics_title  \
304     99       U.S.          1         US      0  Cross-industry   
305     99       U.S.          1         US      0  Cross-industry   
681     99       U.S.          1         US      0  Cross-industry   
682     99       U.S.          1         US      0  Cross-industry   
725     99       U.S.          1         US      0  Cross-industry   
726     99       U.S.          1         US      0  Cross-industry   
774     99       U.S.          1         US      0  Cross-industry   
775     99       U.S.          1         US      0  Cross-industry   
778     99       U.S.          1         US      0  Cross-industry   
779     99       U.S.          1         US      0  Cross-industry   
816     99       U.S.          1         US      0  Cross-industry   
817     99       U.S.          1         US      0  Cross-industry   
917     99       U.S.          1         US      0  Cross-industry   
918     99       U.S.          1         US      0  Cross-industry   
1021    99       U.S.          1         US      0  Cross-industry   
1022    99       U.S.          1         US      0  Cross-industry   
1212    99       U.S.          1         US      0  Cross-industry   
1213    99       U.S.          1         US      0  Cross-industry   

             i_group  own_code occ_code  \
304   cross-industry      1235  19-5000   
305   cross-industry      1235  19-5010   
681   cross-industry      1235  35-1000   
682   cross-industry      1235  35-1010   
725   cross-industry      1235  37-3000   
726   cross-industry      1235  37-3010   
774   cross-industry      1235  39-6000   
775   cross-industry      1235  39-6010   
778   cross-industry      1235  39-7000   
779   cross-industry      1235  39-7010   
816   cross-industry      1235  41-4000   
817   cross-industry      1235  41-4010   
917   cross-industry      1235  43-6000   
918   cross-industry      1235  43-6010   
1021  cross-industry      1235  47-3000   
1022  cross-industry      1235  47-3010   
1212  cross-industry      1235  51-5100   
1213  cross-industry      1235  51-5110   

                                              occ_title o_group  tot_emp  \
304   Occupational Health and Safety Specialists and...   minor   159880   
305   Occupational Health and Safety Specialists and...   broad   159880   
681   Supervisors of Food Preparation and Serving Wo...   minor  1369780   
682   Supervisors of Food Preparation and Serving Wo...   broad  1369780   
725                         Grounds Maintenance Workers   minor  1030070   
726                         Grounds Maintenance Workers   broad  1030070   
774           Baggage Porters, Bellhops, and Concierges   minor    75430   
775           Baggage Porters, Bellhops, and Concierges   broad    75430   
778                              Tour and Travel Guides   minor    49010   
779                              Tour and Travel Guides   broad    49010   
816   Sales Representatives, Wholesale and Manufactu...   minor  1560790   
817   Sales Representatives, Wholesale and Manufactu...   broad  1560790   
917           Secretaries and Administrative Assistants   minor  3195900   
918           Secretaries and Administrative Assistants   broad  3195900   
1021                       Helpers, Construction Trades   minor   186250   
1022                       Helpers, Construction Trades   broad   186250   
1212                                   Printing Workers   minor   204640   
1213                                   Printing Workers   broad   204640   

      emp_prse h_mean a_mean  mean_prse h_pct10 h_pct25 h_median h_pct75  \
304        1.5  40.27  83770        0.6   22.98   28.66    37.93   48.79   
305        1.5  40.27  83770        0.6   22.98   28.66    37.93   48.79   
681        0.7  22.85  47540        0.5   14.23   17.23    21.22    27.4   
682        0.7  22.85  47540        0.5   14.23   17.23    21.22    27.4   
725        0.5  20.07  41750        0.3   14.49   17.05     18.5    22.3   
726        0.5  20.07  41750        0.3   14.49   17.05     18.5    22.3   
774        3.3  18.84  39190        0.9   13.63   15.65     17.7   20.96   
775        3.3  18.84  39190        0.9   13.63   15.65     17.7   20.96   
778        2.7  20.72  43090        3.3   12.93   15.03    17.63   22.07   
779        2.7  20.72  43090        3.3   12.93   15.03    17.63   22.07   
816        0.5  42.16  87690        0.5   18.71   24.43    35.63   49.61   
817        0.5  42.16  87690        0.5   18.71   24.43    35.63   49.61   
917        0.7  25.03  52070        0.2   16.27   18.65    22.82   28.84   
918        0.7  25.03  52070        0.2   16.27   18.65    22.82   28.84   
1021       1.6  20.74  43140        0.7   15.08   17.44    19.44   22.94   
1022       1.6  20.74  43140        0.7   15.08   17.44    19.44   22.94   
1212       1.0  22.24  46260        0.3   15.12   17.76    21.55   25.24   
1213       1.0  22.24  46260        0.3   15.12   17.76    21.55   25.24   

     h_pct90 a_pct10 a_pct25 a_median a_pct75 a_pct90 annual hourly  
304    61.13   47790   59620    78900  101490  127140    NaN    NaN  
305    61.13   47790   59620    78900  101490  127140    NaN    NaN  
681    34.58   29600   35840    44140   56990   71920    NaN    NaN  
682    34.58   29600   35840    44140   56990   71920    NaN    NaN  
725    27.14   30140   35470    38470   46390   56460    NaN    NaN  
726    27.14   30140   35470    38470   46390   56460    NaN    NaN  
774    26.13   28360   32560    36810   43600   54340    NaN    NaN  
775    26.13   28360   32560    36810   43600   54340    NaN    NaN  
778    28.81   26890   31250    36660   45910   59930    NaN    NaN  
779    28.81   26890   31250    36660   45910   59930    NaN    NaN  
816    74.97   38910   50820    74100  103200  155930    NaN    NaN  
817    74.97   38910   50820    74100  103200  155930    NaN    NaN  
917    36.81   33840   38790    47460   59990   76550    NaN    NaN  
918    36.81   33840   38790    47460   59990   76550    NaN    NaN  
1021   27.64   31360   36280    40430   47710   57480    NaN    NaN  
1022   27.64   31360   36280    40430   47710   57480    NaN    NaN  
1212   30.06   31450   36950    44830   52510   62530    NaN    NaN  
1213   30.06   31450   36950    44830   52510   62530    NaN    NaN

print(nat_desc.loc[nat_desc['Field'] == 'o_group', 'Field Description'].values[0])

SOC occupation level. For most occupations, this field indicates the standard SOC major, minor, broad, and detailed levels, in addition to all-occupations totals. For occupations that OEWS no longer publishes at the SOC detailed level, the “detailed” designation indicates the most detailed data available: either a standard SOC broad occupation or an OEWS-specific combination of detailed occupations. Occupations that OEWS has aggregated to the SOC broad occupation level will appear in the file twice, once with the “broad” and once with the “detailed” designation.

counts_thresh = counts[counts > threshold].index

nat_data_filtered = nat_data_pre_drop[(nat_data_pre_drop['occ_title'].isin(counts_thresh)) & (nat_data_pre_drop['o_group'] == 'broad')]

nat_data_filtered

nat_data_pre_drop_2 = nat_data_pre_drop.drop(nat_data_filtered.index)

nat_data_pre_drop_2

(1396 - 1138) * 2

516

nat_data_pre_drop_2[nat_data_pre_drop_2.groupby('occ_title')['occ_title'].transform('count') > threshold]

nat_data_pre_drop_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1138 entries, 0 to 1402
Data columns (total 28 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   area         1138 non-null   int64  
 1   area_title   1138 non-null   object 
 2   area_type    1138 non-null   int64  
 3   prim_state   1138 non-null   object 
 4   naics        1138 non-null   int64  
 5   naics_title  1138 non-null   object 
 6   i_group      1138 non-null   object 
 7   own_code     1138 non-null   int64  
 8   occ_code     1138 non-null   object 
 9   occ_title    1138 non-null   object 
 10  o_group      1138 non-null   object 
 11  tot_emp      1138 non-null   int64  
 12  emp_prse     1138 non-null   float64
 13  h_mean       1138 non-null   object 
 14  a_mean       1138 non-null   object 
 15  mean_prse    1138 non-null   float64
 16  h_pct10      1138 non-null   object 
 17  h_pct25      1138 non-null   object 
 18  h_median     1138 non-null   object 
 19  h_pct75      1138 non-null   object 
 20  h_pct90      1138 non-null   object 
 21  a_pct10      1138 non-null   object 
 22  a_pct25      1138 non-null   object 
 23  a_median     1138 non-null   object 
 24  a_pct75      1138 non-null   object 
 25  a_pct90      1138 non-null   object 
 26  annual       80 non-null     object 
 27  hourly       7 non-null      object 
dtypes: float64(2), int64(5), object(21)
memory usage: 257.8+ KB

nat_data = nat_data_pre_drop_2

nat_data['o_group'].unique()

array(['total', 'major', 'minor', 'broad', 'detailed'], dtype=object)

nat_data['o_group'].value_counts()

o_group
detailed    575
broad       446
minor        94
major        22
total         1
Name: count, dtype: int64

print(nat_desc.loc[nat_desc['Field'] == 'o_group', 'Field Description'].values[0])

SOC occupation level. For most occupations, this field indicates the standard SOC major, minor, broad, and detailed levels, in addition to all-occupations totals. For occupations that OEWS no longer publishes at the SOC detailed level, the “detailed” designation indicates the most detailed data available: either a standard SOC broad occupation or an OEWS-specific combination of detailed occupations. Occupations that OEWS has aggregated to the SOC broad occupation level will appear in the file twice, once with the “broad” and once with the “detailed” designation.

nat_data['tot_emp'].duplicated().sum()

np.int64(68)

with pd.option_context('display.max_columns', None):
    print(nat_data[nat_data.groupby('tot_emp')['tot_emp'].transform('count') > threshold].sort_values(by='tot_emp', ascending=False))

      area area_title  area_type prim_state  naics     naics_title  \
835     99       U.S.          1         US      0  Cross-industry   
836     99       U.S.          1         US      0  Cross-industry   
793     99       U.S.          1         US      0  Cross-industry   
792     99       U.S.          1         US      0  Cross-industry   
815     99       U.S.          1         US      0  Cross-industry   
...    ...        ...        ...        ...    ...             ...   
1005    99       U.S.          1         US      0  Cross-industry   
844     99       U.S.          1         US      0  Cross-industry   
843     99       U.S.          1         US      0  Cross-industry   
566     99       U.S.          1         US      0  Cross-industry   
269     99       U.S.          1         US      0  Cross-industry   

             i_group  own_code occ_code  \
835   cross-industry      1235  43-1000   
836   cross-industry      1235  43-1010   
793   cross-industry      1235  41-1010   
792   cross-industry      1235  41-1000   
815   cross-industry      1235  41-3091   
...              ...       ...      ...   
1005  cross-industry      1235  47-2142   
844   cross-industry      1235  43-2099   
843   cross-industry      1235  43-2090   
566   cross-industry      1235  29-1243   
269   cross-industry      1235  19-3032   

                                              occ_title   o_group  tot_emp  \
835   Supervisors of Office and Administrative Suppo...     minor  1495580   
836   First-Line Supervisors of Office and Administr...     broad  1495580   
793             First-Line Supervisors of Sales Workers     broad  1332180   
792                        Supervisors of Sales Workers     minor  1332180   
815   Sales Representatives of Services, Except Adve...  detailed  1189330   
...                                                 ...       ...      ...   
1005                                       Paperhangers  detailed     1520   
844       Communications Equipment Operators, All Other  detailed     1390   
843    Miscellaneous Communications Equipment Operators     broad     1390   
566                                  Pediatric Surgeons  detailed     1050   
269             Industrial-Organizational Psychologists  detailed     1050   

      emp_prse  h_mean  a_mean  mean_prse h_pct10 h_pct25 h_median h_pct75  \
835        0.4    34.4   71560        0.2   21.12   25.57     31.8   39.59   
836        0.4    34.4   71560        0.2   21.12   25.57     31.8   39.59   
793        0.4   28.77   59830        0.3   15.69   18.61    23.81   32.19   
792        0.4   28.77   59830        0.3   15.69   18.61    23.81   32.19   
815        0.7   39.07   81260        0.5   17.76   22.65    31.86   47.49   
...        ...     ...     ...        ...     ...     ...      ...     ...   
1005      13.7   24.95   51900        3.9   16.84   19.68     23.2   28.11   
844        9.9   26.87   55890        2.1    17.3   21.45       24   30.92   
843        9.9   26.87   55890        2.1    17.3   21.45       24   30.92   
566       23.4  216.74  450810        5.4   91.21       #        #       #   
269       24.7   64.62  134400       11.6   24.94   38.84    52.81   95.27   

     h_pct90 a_pct10 a_pct25 a_median a_pct75 a_pct90 annual hourly  
835    49.51   43920   53190    66140   82340  102980    NaN    NaN  
836    49.51   43920   53190    66140   82340  102980    NaN    NaN  
793    45.76   32630   38710    49510   66960   95170    NaN    NaN  
792    45.76   32630   38710    49510   66960   95170    NaN    NaN  
815    68.29   36930   47120    66260   98780  142040    NaN    NaN  
...      ...     ...     ...      ...     ...     ...    ...    ...  
1005    33.4   35020   40930    48260   58470   69470    NaN    NaN  
844    42.84   35980   44620    49910   64310   89110    NaN    NaN  
843    42.84   35980   44620    49910   64310   89110    NaN    NaN  
566        #  189720       #        #       #       #    NaN    NaN  
269   107.97   51880   80790   109840  198170  224590    NaN    NaN  

[130 rows x 28 columns]

nat_data_filtered = nat_data[nat_data.groupby('tot_emp')['tot_emp'].transform('count') > threshold].sort_values(by='tot_emp', ascending=False)

nat_data_filtered

col_lim_tot_emp = [
    'area',
    'area_title', 
    'area_type',
    'prim_state',
    'naics',
    'naics_title',
    'i_group',
    'own_code',
    'mean_prse',
    'h_pct10',
    'h_pct25',
    'h_median',
    'h_pct75', 
    'h_pct90',
    'a_pct10',
    'a_pct25',
    'a_median',
    'a_pct75', 
    'a_pct90',
    'annual',
    'hourly'
]

nat_data_filtered = nat_data_filtered.drop(columns=col_lim_tot_emp)

nat_data_filtered

str_depth = 5

# From the filtered dataset of duplicates, capture the count of the broad-level categories
# Return the series as a dataframe
first_str = pd.DataFrame(nat_data_filtered['occ_code'].str[:str_depth].value_counts())

with pd.option_context('display.max_rows', None):
    print(first_str)

          count
occ_code       
27-30         4
51-40         4
49-90         4
17-21         4
19-20         3
27-20         3
19-10         3
47-50         3
25-30         3
43-91         3
27-40         3
51-70         3
15-12         3
19-40         3
11-91         3
19-30         3
29-12         3
37-10         3
45-10         2
43-30         2
29-10         2
43-10         2
53-71         2
53-30         2
51-91         2
21-20         2
53-60         2
27-10         2
15-20         2
53-40         2
17-30         2
43-20         2
41-10         2
47-10         2
39-90         2
53-10         2
13-20         2
51-10         2
25-90         2
49-10         2
35-90         2
51-41         2
13-11         2
41-30         2
43-41         2
51-80         1
33-90         1
39-40         1
47-21         1
51-20         1
51-30         1
47-20         1
25-20         1
11-30         1
17-20         1
35-20         1
53-50         1
51-90         1
29-90         1
21-10         1
17-10         1
41-90         1
11-90         1

# From the list of rows of the broad-level occ_code's
# capture those that only appear once
# Return the index of the resulting series as a dataframe 
first_str_lim = pd.DataFrame(first_str[first_str['count'] == threshold].index)

first_str_lim.sort_values(by='occ_code', ascending=False)

# Create a dataframe to hold a list of the appropriate length broad_level rows
broad_level_list = pd.DataFrame(nat_data_filtered['occ_code'].str[:str_depth])

# Re-filter nat_data_filtered to capture only those rows where the broad-level code appears once 
broad_level_list_one = nat_data_filtered[broad_level_list['occ_code'].isin(first_str_lim['occ_code'])].sort_values(by='occ_code')

broad_level_list_one

broad_level_list_one.shape

(18, 7)

# Look in the first and second filtered df's to see where the 'tot_emp' values match
nat_data_filtered[nat_data_filtered['tot_emp'].isin(broad_level_list_one['tot_emp'])].sort_values(by='tot_emp', ascending=False)

nat_data_filtered[nat_data_filtered['tot_emp'].isin(broad_level_list_one['tot_emp'])].shape

(35, 7)

# Create temporary drop and count markers for the function
count_marker = 'ct'
drop_marker = 'drop'

# Add the temporary columns to the filtered dataset

nat_data_filtered[count_marker] = 0
nat_data_filtered[drop_marker] = False

# Checks a df first by initial grouping, and then subgrouping to discern duplicates
# Updates provided df in place
# Requires initial dataframe have columns for marking counting within subgroups and a drop marker
def check_unique_within_subgroup(
    df,
    fir_grp,
    sec_grp,
    threshold,
    count_marker,
    drop_marker
):    
    # Iterate through the first group and separate by fir_grp
    for grp_num, group_df in df.groupby(fir_grp):
        
        # Debug:
        # print(f"Checking group: {grp_num}")

        # Iterate through sec_grp, count values, and set count of each to count marker column
        group_df[count_marker] = group_df.groupby(sec_grp)[sec_grp].transform('count')
        
        # In the drop_marker col, indicate whether this passes the threshold
        group_df[drop_marker] = group_df[count_marker] == threshold
        # print(group_df)

        # For loop to iterate through each value in the subgroup
        # and, in the parent df, set the associated count_marker and drop_marker
        for grp_idx, grp_row in group_df.iterrows():

            # Debug:
            # print(f"\ngrp_idx: {grp_idx},\ncount_marker: {grp_row[count_marker]},\ndrop_marker: {grp_row[drop_marker]}")
            
            # Debug:
            # print(f"\ndf tot_emp: {df.loc[grp_idx, fir_grp]}\ndf curr count: {df.loc[grp_idx, count_marker]}")

            # Set the count_marker in the parent df
            df.loc[grp_idx, count_marker] = grp_row[count_marker]

            # Set the drop_marker in the parent df
            df.loc[grp_idx, drop_marker] = grp_row[drop_marker]

# Call the custom function on nat_data_filtered
check_unique_within_subgroup(nat_data_filtered, 'tot_emp', 'emp_prse', threshold, count_marker, drop_marker)

# Check result
with pd.option_context('display.max_rows', None, 'display.max_colwidth', 32):
    print(nat_data_filtered[['occ_title', 'emp_prse', 'tot_emp', count_marker, drop_marker, 'a_mean']])

                            occ_title  emp_prse  tot_emp  ct   drop  a_mean
835   Supervisors of Office and Ad...       0.4  1495580   2  False   71560
836   First-Line Supervisors of Of...       0.4  1495580   2  False   71560
793   First-Line Supervisors of Sa...       0.4  1332180   2  False   59830
792      Supervisors of Sales Workers       0.4  1332180   2  False   59830
815   Sales Representatives of Ser...       0.7  1189330   2  False   81260
814   Miscellaneous Sales Represen...       0.7  1189330   2  False   81260
106   Miscellaneous Business Opera...       0.6  1128200   2  False   92380
107   Business Operations Speciali...       0.6  1128200   2  False   92380
968   Supervisors of Construction ...       0.6   806080   2  False   84500
969   First-Line Supervisors of Co...       0.6   806080   2  False   84500
1147  First-Line Supervisors of Pr...       0.3   685140   2  False   74540
1146  Supervisors of Production Wo...       0.3   685140   2  False   74540
73                Managers, All Other       0.5   630980   2  False  149890
72             Miscellaneous Managers       0.5   630980   2  False  149890
1313  Supervisors of Transportatio...       0.4   615680   2  False   65680
1314  First-Line Supervisors of Tr...       0.4   615680   2  False   65680
1072  First-Line Supervisors of Me...       0.5   600680   2  False   82930
1071  Supervisors of Installation,...       0.5   600680   2  False   82930
154   Computer Occupations, All Other       1.0   439380   2  False  116700
153   Miscellaneous Computer Occup...       1.0   439380   2  False  116700
714   Supervisors of Building and ...       1.4   298790   2  False   54580
1188                       Machinists       1.0   298790   1   True   57390
715   First-Line Supervisors of Bu...       1.4   298790   2  False   54580
941   Miscellaneous Office and Adm...       1.5   195890   2  False   48700
942   Office and Administrative Su...       1.5   195890   2  False   48700
423                            Tutors       1.8   174660   1   True   47780
716   First-Line Supervisors of Ho...       1.4   174660   1   True   51170
210              Engineers, All Other       1.2   150750   2  False  121720
209           Miscellaneous Engineers       1.2   150750   2  False  121720
896   Information and Record Clerk...       0.9   143910   2  False   50200
895   Miscellaneous Information an...       0.9   143910   2  False   50200
1170  Meat, Poultry, and Fish Cutt...       1.2   141090   1   True   38640
21                Facilities Managers       0.7   141090   1   True  114520
131   Financial Specialists, All O...       1.3   127450   2  False   93890
130   Miscellaneous Financial Spec...       1.3   127450   2  False   93890
425   Miscellaneous Teachers and I...       4.1   125010   2  False   71610
426   Teachers and Instructors, Al...       4.1   125010   2  False   71610
445   Educational Instruction and ...       1.1   114640   2  False   55700
444   Miscellaneous Educational In...       1.1   114640   2  False   55700
491   Miscellaneous Media and Comm...       2.8    89580   1   True   71560
711   Miscellaneous Food Preparati...       5.2    89580   2  False   35990
712   Food Preparation and Serving...       5.2    89580   2  False   35990
1241  Cabinetmakers and Bench Carp...       1.7    79540   1   True   47460
1126  Precision Instrument and Equ...       2.2    79540   1   True   65560
1352     Water Transportation Workers       2.4    77710   1   True   80150
1206  Miscellaneous Metal Workers ...       1.8    77710   1   True   47500
312    Marriage and Family Therapists       4.0    65870   1   True   72720
173   Surveyors, Cartographers, an...       2.4    65870   1   True   77920
822   Demonstrators and Product Pr...       5.4    64770   1   True   43730
145               Database Architects       2.7    64770   1   True  142620
789   Miscellaneous Personal Care ...       4.6    62390   2  False   37990
790   Personal Care and Service Wo...       4.6    62390   2  False   37990
503   Television, Video, and Film ...       3.8    53330   1   True   81850
47    Education Administrators, Al...       1.9    53330   1   True   99460
1339  Motor Vehicle Operators, All...       3.1    50330   2  False   40980
1338  Miscellaneous Motor Vehicle ...       3.1    50330   2  False   40980
520                      Optometrists       2.7    41890   1   True  140940
550                 Anesthesiologists       7.9    41890   1   True  336640
483   News Analysts, Reporters and...       2.2    41550   2  False  106030
484   News Analysts, Reporters, an...       2.2    41550   2  False  106030
860    Miscellaneous Financial Clerks       3.0    37030   2  False   55330
861       Financial Clerks, All Other       3.0    37030   2  False   55330
282   Social Scientists and Relate...       1.8    36970   1   True  106440
609   Healthcare Practitioners and...       1.9    36970   1   True   73200
944   Supervisors of Farming, Fish...       2.4    29530   2  False   63360
945   First-Line Supervisors of Fa...       2.4    29530   2  False   63360
1272  Crushing, Grinding, and Poli...       3.3    28550   1   True   49400
284   Agricultural and Food Scienc...       3.0    28550   1   True   52030
1401  Miscellaneous Material Movin...       4.4    25190   2  False   46650
1402  Material Moving Workers, All...       4.4    25190   2  False   46650
494   Media and Communication Work...       7.7    23590   1   True   84870
692                  Cooks, All Other       7.4    23590   1   True   38000
262    Physical Scientists, All Other       2.4    22580   2  False  123070
261   Miscellaneous Physical Scien...       2.4    22580   2  False  123070
1191  Metal-Refining Furnace Opera...       2.4    20330   1   True   57290
183                Chemical Engineers       3.7    20330   1   True  128430
1186  Lathe and Turning Machine To...       3.4    18970   1   True   50190
207               Petroleum Engineers       5.8    18970   1   True  153560
468   Athletes and Sports Competitors       7.3    14370   1   True  259750
233   Food Scientists and Technolo...       5.9    14370   1   True   92190
1058  Continuous Mining Machine Op...       3.6    14340   1   True   63920
285          Agricultural Technicians       4.9    14340   1   True   49680
406   Career/Technical Education T...       3.6    14200   1   True   68690
286          Food Science Technicians       3.0    14200   1   True   54400
61    Postmasters and Mail Superin...       0.0    13810   1   True   93760
1187  Milling and Planing Machine ...       4.8    13810   1   True   52940
507   Media and Communication Equi...       9.9    13020   2  False   78350
506   Miscellaneous Media and Comm...       9.9    13020   2  False   78350
222   Environmental Engineering Te...       6.4    12500   1   True   63070
995                            Tapers       7.6    12500   1   True   70390
1303  Adhesive Bonding Machine Ope...       3.4    12170   1   True   45670
1153  Coil Winders, Tapers, and Fi...       4.0    12170   1   True   48940
333      Religious Workers, All Other       9.0    12170   2  False   49830
332   Miscellaneous Religious Workers       9.0    12170   2  False   49830
1374  Miscellaneous Transportation...       4.7    10960   2  False   45040
1375  Transportation Workers, All ...       4.7    10960   2  False   45040
672   Gambling Surveillance Office...       2.1    10000   1   True   46710
451   Fine Artists, Including Pain...       7.7    10000   1   True   76450
462              Designers, All Other       9.7     9680   1   True   78000
1131  Precision Instrument and Equ...       4.9     9680   1   True   70420
472                           Dancers       9.6     9060   1   True       *
218   Aerospace Engineering and Op...       4.3     9060   1   True   86330
197   Marine Engineers and Naval A...       5.2     8440   1   True  116680
569                    Acupuncturists      19.5     8440   1   True   89750
247        Life Scientists, All Other       5.2     7320   2  False  101940
246     Miscellaneous Life Scientists       5.2     7320   2  False  101940
1251        Miscellaneous Woodworkers       6.5     6590   2  False   43730
1252           Woodworkers, All Other       6.5     6590   2  False   43730
1305  Cooling and Freezing Equipme...       3.5     6590   1   True   44290
1068  Miscellaneous Extraction Wor...       8.5     6070   2  False   54780
1069    Extraction Workers, All Other       8.5     6070   2  False   54780
517   Dentists, All Other Specialists      10.7     5900   1   True  246530
939            Statistical Assistants       5.7     5900   1   True   55470
1129  Musical Instrument Repairers...       5.2     5730   1   True   49020
1210  Tool Grinders, Filers, and S...       4.8     5730   1   True   51790
260                      Hydrologists       3.9     5720   1   True   98130
1255  Nuclear Power Reactor Operators       1.0     5720   1   True  122830
166   Miscellaneous Mathematical S...       6.5     4660   2  False   84700
167   Mathematical Science Occupat...       6.5     4660   2  False   84700
473                    Choreographers       9.4     3430   1   True   62020
1138                Commercial Divers      12.6     3430   1   True   78110
273                      Sociologists       7.2     2950   1   True  111670
760               Crematory Operators       5.7     2950   1   True   44790
1350  Miscellaneous Rail Transport...      10.3     1520   2  False   53500
1351  Rail Transportation Workers,...      10.3     1520   2  False   53500
1005                     Paperhangers      13.7     1520   1   True   51900
844   Communications Equipment Ope...       9.9     1390   2  False   55890
843   Miscellaneous Communications...       9.9     1390   2  False   55890
566                Pediatric Surgeons      23.4     1050   1   True  450810
269   Industrial-Organizational Ps...      24.7     1050   1   True  134400

with pd.option_context('display.max_rows',None):
    print(nat_data_filtered[nat_data_filtered[drop_marker] == True].sort_values(by='tot_emp',ascending=False))

     occ_code                                          occ_title   o_group  \
1188  51-4040                                         Machinists     broad   
423   25-3040                                             Tutors     broad   
716   37-1011  First-Line Supervisors of Housekeeping and Jan...  detailed   
1170  51-3022       Meat, Poultry, and Fish Cutters and Trimmers  detailed   
21    11-3013                                Facilities Managers  detailed   
491   27-3090      Miscellaneous Media and Communication Workers     broad   
1241  51-7010                 Cabinetmakers and Bench Carpenters     broad   
1126  49-9060       Precision Instrument and Equipment Repairers     broad   
1352  53-5000                       Water Transportation Workers     minor   
1206  51-4190    Miscellaneous Metal Workers and Plastic Workers     broad   
312   21-1013                     Marriage and Family Therapists  detailed   
173   17-1020    Surveyors, Cartographers, and Photogrammetrists     broad   
822   41-9011                Demonstrators and Product Promoters  detailed   
145   15-1243                                Database Architects  detailed   
47    11-9039                Education Administrators, All Other  detailed   
503   27-4030  Television, Video, and Film Camera Operators a...     broad   
520   29-1040                                       Optometrists     broad   
550   29-1211                                  Anesthesiologists  detailed   
282   19-3099   Social Scientists and Related Workers, All Other  detailed   
609   29-9099  Healthcare Practitioners and Technical Workers...  detailed   
1272  51-9021  Crushing, Grinding, and Polishing Machine Sett...  detailed   
284   19-4010          Agricultural and Food Science Technicians     broad   
494   27-3099         Media and Communication Workers, All Other  detailed   
692   35-2019                                   Cooks, All Other  detailed   
1191  51-4051       Metal-Refining Furnace Operators and Tenders  detailed   
183   17-2040                                 Chemical Engineers     broad   
1186  51-4034  Lathe and Turning Machine Tool Setters, Operat...  detailed   
207   17-2170                                Petroleum Engineers     broad   
468   27-2021                    Athletes and Sports Competitors  detailed   
233   19-1012                  Food Scientists and Technologists  detailed   
285   19-4012                           Agricultural Technicians  detailed   
1058  47-5041                Continuous Mining Machine Operators  detailed   
286   19-4013                           Food Science Technicians  detailed   
406   25-2023  Career/Technical Education Teachers, Middle Sc...  detailed   
61    11-9130               Postmasters and Mail Superintendents     broad   
1187  51-4035  Milling and Planing Machine Setters, Operators...  detailed   
222   17-3025  Environmental Engineering Technologists and Te...  detailed   
995   47-2082                                             Tapers  detailed   
1303  51-9191     Adhesive Bonding Machine Operators and Tenders  detailed   
1153  51-2021                Coil Winders, Tapers, and Finishers  detailed   
672   33-9031  Gambling Surveillance Officers and Gambling In...  detailed   
451   27-1013  Fine Artists, Including Painters, Sculptors, a...  detailed   
462   27-1029                               Designers, All Other  detailed   
1131  49-9069  Precision Instrument and Equipment Repairers, ...  detailed   
472   27-2031                                            Dancers  detailed   
218   17-3021  Aerospace Engineering and Operations Technolog...  detailed   
569   29-1291                                     Acupuncturists  detailed   
197   17-2120              Marine Engineers and Naval Architects     broad   
1305  51-9193  Cooling and Freezing Equipment Operators and T...  detailed   
517   29-1029                    Dentists, All Other Specialists  detailed   
939   43-9110                             Statistical Assistants     broad   
1129  49-9063            Musical Instrument Repairers and Tuners  detailed   
1210  51-4194              Tool Grinders, Filers, and Sharpeners  detailed   
260   19-2043                                       Hydrologists  detailed   
1255  51-8011                    Nuclear Power Reactor Operators  detailed   
473   27-2032                                     Choreographers  detailed   
1138  49-9092                                  Commercial Divers  detailed   
273   19-3040                                       Sociologists     broad   
760   39-4012                                Crematory Operators  detailed   
1005  47-2142                                       Paperhangers  detailed   
566   29-1243                                 Pediatric Surgeons  detailed   
269   19-3032            Industrial-Organizational Psychologists  detailed   

      tot_emp  emp_prse  h_mean  a_mean  ct  drop  
1188   298790       1.0   27.59   57390   1  True  
423    174660       1.8   22.97   47780   1  True  
716    174660       1.4    24.6   51170   1  True  
1170   141090       1.2   18.58   38640   1  True  
21     141090       0.7   55.06  114520   1  True  
491     89580       2.8    34.4   71560   1  True  
1241    79540       1.7   22.82   47460   1  True  
1126    79540       2.2   31.52   65560   1  True  
1352    77710       2.4   38.54   80150   1  True  
1206    77710       1.8   22.84   47500   1  True  
312     65870       4.0   34.96   72720   1  True  
173     65870       2.4   37.46   77920   1  True  
822     64770       5.4   21.03   43730   1  True  
145     64770       2.7   68.57  142620   1  True  
47      53330       1.9   47.82   99460   1  True  
503     53330       3.8   39.35   81850   1  True  
520     41890       2.7   67.76  140940   1  True  
550     41890       7.9  161.85  336640   1  True  
282     36970       1.8   51.17  106440   1  True  
609     36970       1.9   35.19   73200   1  True  
1272    28550       3.3   23.75   49400   1  True  
284     28550       3.0   25.01   52030   1  True  
494     23590       7.7   40.81   84870   1  True  
692     23590       7.4   18.27   38000   1  True  
1191    20330       2.4   27.55   57290   1  True  
183     20330       3.7   61.75  128430   1  True  
1186    18970       3.4   24.13   50190   1  True  
207     18970       5.8   73.83  153560   1  True  
468     14370       7.3       *  259750   1  True  
233     14370       5.9   44.32   92190   1  True  
285     14340       4.9   23.88   49680   1  True  
1058    14340       3.6   30.73   63920   1  True  
286     14200       3.0   26.15   54400   1  True  
406     14200       3.6       *   68690   1  True  
61      13810       0.0   45.08   93760   1  True  
1187    13810       4.8   25.45   52940   1  True  
222     12500       6.4   30.32   63070   1  True  
995     12500       7.6   33.84   70390   1  True  
1303    12170       3.4   21.96   45670   1  True  
1153    12170       4.0   23.53   48940   1  True  
672     10000       2.1   22.46   46710   1  True  
451     10000       7.7   36.76   76450   1  True  
462      9680       9.7    37.5   78000   1  True  
1131     9680       4.9   33.86   70420   1  True  
472      9060       9.6    30.3       *   1  True  
218      9060       4.3    41.5   86330   1  True  
569      8440      19.5   43.15   89750   1  True  
197      8440       5.2    56.1  116680   1  True  
1305     6590       3.5   21.29   44290   1  True  
517      5900      10.7  118.52  246530   1  True  
939      5900       5.7   26.67   55470   1  True  
1129     5730       5.2   23.57   49020   1  True  
1210     5730       4.8    24.9   51790   1  True  
260      5720       3.9   47.18   98130   1  True  
1255     5720       1.0   59.05  122830   1  True  
473      3430       9.4   29.82   62020   1  True  
1138     3430      12.6   37.55   78110   1  True  
273      2950       7.2   53.69  111670   1  True  
760      2950       5.7   21.53   44790   1  True  
1005     1520      13.7   24.95   51900   1  True  
566      1050      23.4  216.74  450810   1  True  
269      1050      24.7   64.62  134400   1  True

nat_data_filtered[nat_data_filtered[drop_marker] == True].shape

(62, 9)

nat_data_filtered[nat_data_filtered[drop_marker] == True]['tot_emp'].value_counts()

tot_emp
14340     2
174660    2
2950      2
3430      2
5720      2
5730      2
5900      2
8440      2
9060      2
9680      2
10000     2
12170     2
12500     2
13810     2
14200     2
1050      2
14370     2
53330     2
141090    2
79540     2
77710     2
65870     2
18970     2
64770     2
41890     2
36970     2
28550     2
23590     2
20330     2
6590      1
89580     1
1520      1
298790    1
Name: count, dtype: int64

nat_data_filtered = nat_data_filtered[nat_data_filtered[drop_marker] == False]

nat_data_filtered

nat_data_pre_drop = nat_data.drop(nat_data_filtered.index)

nat_data_pre_drop.head()

nat_data_pre_drop.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1070 entries, 0 to 1399
Data columns (total 28 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   area         1070 non-null   int64  
 1   area_title   1070 non-null   object 
 2   area_type    1070 non-null   int64  
 3   prim_state   1070 non-null   object 
 4   naics        1070 non-null   int64  
 5   naics_title  1070 non-null   object 
 6   i_group      1070 non-null   object 
 7   own_code     1070 non-null   int64  
 8   occ_code     1070 non-null   object 
 9   occ_title    1070 non-null   object 
 10  o_group      1070 non-null   object 
 11  tot_emp      1070 non-null   int64  
 12  emp_prse     1070 non-null   float64
 13  h_mean       1070 non-null   object 
 14  a_mean       1070 non-null   object 
 15  mean_prse    1070 non-null   float64
 16  h_pct10      1070 non-null   object 
 17  h_pct25      1070 non-null   object 
 18  h_median     1070 non-null   object 
 19  h_pct75      1070 non-null   object 
 20  h_pct90      1070 non-null   object 
 21  a_pct10      1070 non-null   object 
 22  a_pct25      1070 non-null   object 
 23  a_median     1070 non-null   object 
 24  a_pct75      1070 non-null   object 
 25  a_pct90      1070 non-null   object 
 26  annual       78 non-null     object 
 27  hourly       7 non-null      object 
dtypes: float64(2), int64(5), object(21)
memory usage: 242.4+ KB

nat_data_pre_drop.shape

(1070, 28)

1138 - 68

1070

nat_data_pre_drop['tot_emp'].duplicated().sum()

np.int64(29)

nat_data_pre_drop[nat_data_pre_drop['tot_emp'].duplicated() == True]

nat_data = nat_data_pre_drop

list_unique = nat_data['emp_prse'].unique()

temp_df = pd.DataFrame()
temp_df['emp_prse_count'] = pd.DataFrame(list_unique)

temp_df.sort_index(ascending=False)

temp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   emp_prse_count  130 non-null    float64
dtypes: float64(1)
memory usage: 1.1 KB

with pd.option_context('max_colwidth', None):
    print(nat_desc.loc[nat_desc['Field'] == 'emp_prse', 'Field Description'])

12    Percent relative standard error (PRSE) for the employment estimate. PRSE is a measure of sampling error, expressed as a percentage of the corresponding estimate. Sampling error occurs when values for a population are estimated from a sample survey of the population, rather than calculated from data for all members of the population. Estimates with lower PRSEs are typically more precise in the presence of sampling error.
Name: Field Description, dtype: object

list_unique = nat_data['emp_prse'].unique()

temp_df = pd.DataFrame()
temp_df['mean_prse_count'] = pd.DataFrame(list_unique)

temp_df.sort_index(ascending=False)

temp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   mean_prse_count  130 non-null    float64
dtypes: float64(1)
memory usage: 1.1 KB

with pd.option_context('max_colwidth', None):
    print(nat_desc.loc[nat_desc['Field'] == 'mean_prse', 'Field Description'])

19    Percent relative standard error (PRSE) for the mean wage estimate. PRSE is a measure of sampling error, expressed as a percentage of the corresponding estimate. Sampling error occurs when values for a population are estimated from a sample survey of the population, rather than calculated from data for all members of the population. Estimates with lower PRSEs are typically more precise in the presence of sampling error.
Name: Field Description, dtype: object

temp_df.sort_index(ascending=False)

col_hr_an = [
    'h_mean',
    'a_mean',
    'mean_prse',
    'h_pct10',
    'h_pct25',
    'h_median',
    'h_pct75',
    'h_pct90',
    'a_pct10',
    'a_pct25',
    'a_median',
    'a_pct75',
    'a_pct90',
    'annual',
    'hourly'
]

nat_data[col_hr_an].info()

<class 'pandas.core.frame.DataFrame'>
Index: 1070 entries, 0 to 1399
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   h_mean     1070 non-null   object 
 1   a_mean     1070 non-null   object 
 2   mean_prse  1070 non-null   float64
 3   h_pct10    1070 non-null   object 
 4   h_pct25    1070 non-null   object 
 5   h_median   1070 non-null   object 
 6   h_pct75    1070 non-null   object 
 7   h_pct90    1070 non-null   object 
 8   a_pct10    1070 non-null   object 
 9   a_pct25    1070 non-null   object 
 10  a_median   1070 non-null   object 
 11  a_pct75    1070 non-null   object 
 12  a_pct90    1070 non-null   object 
 13  annual     78 non-null     object 
 14  hourly     7 non-null      object 
dtypes: float64(1), object(14)
memory usage: 133.8+ KB

nat_data[col_hr_an].head(10)

nat_data['h_pct90'][0]

60.44

print(type(nat_data['h_pct90'][0]))

<class 'float'>

nat_data['h_pct90'][1]

'#'

print(type(nat_data['h_pct90'][1]))

<class 'str'>

hr_an_desc = pd.DataFrame(nat_desc.loc[nat_desc['Field'].isin(col_hr_an),['Field', 'Field Description']])

with pd.option_context('max_colwidth', None):
    print(hr_an_desc)

        Field  \
17     h_mean   
18     a_mean   
19  mean_prse   
20    h_pct10   
21    h_pct25   
22   h_median   
23    h_pct75   
24    h_pct90   
25    a_pct10   
26    a_pct25   
27   a_median   
28    a_pct75   
29    a_pct90   
30     annual   
31     hourly   

                                                                                                                                                                                                                                                                                                                                                                                                                         Field Description  
17                                                                                                                                                                                                                                                                                                                                                                                                                        Mean hourly wage  
18                                                                                                                                                                                                                                                                                                                                                                                                                       Mean annual wage   
19  Percent relative standard error (PRSE) for the mean wage estimate. PRSE is a measure of sampling error, expressed as a percentage of the corresponding estimate. Sampling error occurs when values for a population are estimated from a sample survey of the population, rather than calculated from data for all members of the population. Estimates with lower PRSEs are typically more precise in the presence of sampling error.  
20                                                                                                                                                                                                                                                                                                                                                                                                             Hourly 10th percentile wage  
21                                                                                                                                                                                                                                                                                                                                                                                                             Hourly 25th percentile wage  
22                                                                                                                                                                                                                                                                                                                                                                                             Hourly median wage (or the 50th percentile)  
23                                                                                                                                                                                                                                                                                                                                                                                                             Hourly 75th percentile wage  
24                                                                                                                                                                                                                                                                                                                                                                                                             Hourly 90th percentile wage  
25                                                                                                                                                                                                                                                                                                                                                                                                             Annual 10th percentile wage  
26                                                                                                                                                                                                                                                                                                                                                                                                             Annual 25th percentile wage  
27                                                                                                                                                                                                                                                                                                                                                                                             Annual median wage (or the 50th percentile)  
28                                                                                                                                                                                                                                                                                                                                                                                                             Annual 75th percentile wage  
29                                                                                                                                                                                                                                                                                                                                                                                                             Annual 90th percentile wage  
30                                                                                                                                                                                       Contains "TRUE" if only annual wages are released. The OEWS program releases only annual wages for some occupations that typically work fewer than 2,080 hours per year, but are paid on an annual basis, such as teachers, pilots, and athletes.  
31                                                                                                                                                                            Contains "TRUE" if only hourly wages are released. The OEWS program releases only hourly wages for some occupations that typically work fewer than 2,080 hours per year and are paid on an hourly basis, such as actors, dancers, and musicians and singers.

with pd.option_context('max_colwidth', None):
    print(notes_df)

                                                                                         Notes
34                                        *  = indicates that a wage estimate is not available
35                                **  = indicates that an employment estimate is not available
36       #  = indicates a wage equal to or greater than $115.00 per hour or $239,200 per year 
37  ~ =indicates that the percent of establishments reporting the occupation is less than 0.5%

nat_data['h_pct90'].apply(type).unique()

array([<class 'float'>, <class 'str'>, <class 'int'>], dtype=object)

nat_data['h_pct90'][nat_data['h_pct90'].apply(type) == str].unique()

array(['#', '*'], dtype=object)

nat_data['h_pct90'][nat_data['h_pct90'].apply(type) == int].unique()

array([110, 86, 61, 40, 60, 50, 20, 29, 54, 23, 35, 33], dtype=object)

nat_data['h_pct90'][nat_data['h_pct90'].apply(type) == int].value_counts()

h_pct90
40     4
110    1
86     1
61     1
60     1
50     1
20     1
29     1
54     1
23     1
35     1
33     1
Name: count, dtype: int64

nat_num_only = pd.DataFrame(nat_data['h_pct90'][(nat_data['h_pct90'].apply(type) != str)])

nat_num_only['h_pct90'][nat_num_only['h_pct90'] > 99]

17      104.16
28      105.35
36      105.76
46      102.12
57      105.33
117     102.79
137      111.6
145     100.96
147      99.92
149     101.66
156      99.24
187     107.61
207        110
232     113.34
264     102.27
269     107.97
339     101.39
342     104.11
449     101.64
474     100.11
476     105.44
509     102.34
537     102.35
810     103.47
1322    101.16
Name: h_pct90, dtype: object

temp_df = nat_data[nat_data['h_pct90'] == '*']

temp_df[col_hr_an].head()

type(nat_data.loc[nat_data['h_pct90'] == '*','a_pct10'].values[0])

int

# Lambda logic to obtain a_pct90 instances of non-int values 
nat_data[nat_data['a_pct90'].apply(lambda x: isinstance(x, int)) == False].head(2)

nat_data[nat_data['a_pct90'].apply(lambda x: isinstance(x, int)) == False].shape

(67, 28)

# Logic to obtain h_pct90 is * and a_pct is not #
(nat_data['h_pct90'] == '*') & (nat_data['a_pct90'] != '#')

0       False
1       False
2       False
3       False
5       False
        ...  
1394    False
1395    False
1396    False
1397    False
1399    False
Length: 1070, dtype: bool

# Test that instances are found
((nat_data['h_pct90'] == '*') & (nat_data['a_pct90'] != '#')).sum()

np.int64(71)

# Combine the two together
nat_data[(nat_data['h_pct90'] == '*') & (nat_data['a_pct90'] != '#') & (nat_data['a_pct90'].apply(lambda x: isinstance(x, int)) == False)]

nat_data[(nat_data['h_mean'] == '*') & (nat_data['a_mean'] == '*')]

nat_data[(nat_data['h_mean'].apply(lambda x: isinstance(x, int)) == True) & (nat_data['a_mean'].apply(lambda x: isinstance(x, int)) == True)].shape

(10, 28)

temp_df = nat_data[(nat_data['h_mean'].apply(lambda x: isinstance(x, int)) == True) & (nat_data['a_mean'].apply(lambda x: isinstance(x, int)) == True)]

temp_col = ['occ_title'] + col_hr_an
temp_df[temp_col]

nat_data_mask = nat_data[nat_data.isin(['~']).any(axis=1)]

nat_data_mask.shape

(0, 28)

nat_data_max = nat_data[nat_data.isin(['**']).any(axis=1)]

nat_data_mask.shape

(0, 28)

# List of columns to drop from the filtered dataset for simplicity
an_hr_col_drop = [
    'area',
    'area_title',
    'area_type',
    'prim_state',
    'naics',
    'naics_title',
    'i_group',
    'own_code',
    'occ_code',
    'o_group',
    'tot_emp',
    'emp_prse'
]

# Filter the primary dataset
nat_data_filtered = nat_data.drop(columns=an_hr_col_drop, axis=1)

# Capture a map of the types in the h_mean column
h_mean_df = pd.DataFrame(nat_data_filtered['h_mean'].map(type))

# Capture the index of all rows of h_mean that are of type str
h_mean_str_idx = h_mean_df[h_mean_df['h_mean'] == str].index

# Capture all str values in the h_mean col
nat_data_h_mean_str = pd.DataFrame(nat_data_filtered['h_mean'].loc[h_mean_str_idx])

nat_data_h_mean_str['h_mean'].value_counts()

h_mean
*    78
Name: count, dtype: int64

# Capture the index of all `*` cells
h_mean_star_idx = nat_data_h_mean_str[nat_data_h_mean_str['h_mean'] == '*'].index

print(len(h_mean_star_idx))

78

# Set each of these cells to `NaN`
nat_data_filtered.loc[h_mean_star_idx, 'h_mean'] = np.nan

# Check that no type str values remain in the column
nat_data_filtered[nat_data_filtered['h_mean'].apply(type) == str].shape

(0, 16)

# Check that each of the targeted cells is NaN 
nat_data_filtered['h_mean'].loc[h_mean_star_idx]

7       NaN
45      NaN
350     NaN
351     NaN
353     NaN
       ... 
1317    NaN
1318    NaN
1319    NaN
1320    NaN
1324    NaN
Name: h_mean, Length: 78, dtype: object

nat_data_filtered['h_mean'] = nat_data_filtered['h_mean'].astype(float)

nat_data_filtered['h_mean'].describe()

count    992.000000
mean      36.408649
std       23.397692
min       14.820000
25%       22.847500
50%       30.115000
75%       41.217500
max      216.740000
Name: h_mean, dtype: float64

nat_data_filtered['h_mean'].isna().sum()

np.int64(78)

nat_data['h_mean'] = nat_data_filtered['h_mean']

nat_data[nat_data['h_mean'].apply(type) == str].shape

(0, 28)

nat_data[nat_data['h_mean'].apply(type) == int].shape

(0, 28)

nat_data['h_mean'].isna().sum()

np.int64(78)

nat_data['h_mean'].describe()

count    992.000000
mean      36.408649
std       23.397692
min       14.820000
25%       22.847500
50%       30.115000
75%       41.217500
max      216.740000
Name: h_mean, dtype: float64

# Filter the primary dataset
nat_data_filtered = nat_data.drop(columns=an_hr_col_drop, axis=1)

# Capture a map of the types in the a_mean column
a_mean_df = pd.DataFrame(nat_data_filtered['a_mean'].map(type))

# Capture the index of all rows of a_mean that are of type str
a_mean_str_idx = a_mean_df[a_mean_df['a_mean'] == str].index

# Capture all str values in the a_mean col
nat_data_a_mean_str = pd.DataFrame(nat_data_filtered['a_mean'].loc[a_mean_str_idx])

nat_data_a_mean_str['a_mean'].value_counts()

a_mean
*    7
Name: count, dtype: int64

# Capture the index of all `*` cells
a_mean_star_idx = nat_data_a_mean_str[nat_data_a_mean_str['a_mean'] == '*'].index

print(len(a_mean_star_idx))

7

# Set each of these cells to `NaN`
nat_data_filtered.loc[a_mean_star_idx, 'a_mean'] = np.nan

# Check that no type str values remain in the column
nat_data_filtered[nat_data_filtered['a_mean'].apply(type) == str].shape

(0, 16)

# Check that each of the targeted cells is NaN 
nat_data_filtered['a_mean'].loc[a_mean_star_idx]

465    NaN
472    NaN
474    NaN
476    NaN
477    NaN
478    NaN
479    NaN
Name: a_mean, dtype: object

nat_data_filtered['a_mean'] = nat_data_filtered['a_mean'].astype(float)

nat_data_filtered['a_mean'].describe()

count      1063.000000
mean      77213.170273
std       48508.978729
min       30830.000000
25%       48560.000000
50%       64220.000000
75%       89740.000000
max      450810.000000
Name: a_mean, dtype: float64

nat_data_filtered['a_mean'].isna().sum()

np.int64(7)

nat_data['a_mean'] = nat_data_filtered['a_mean']

nat_data[nat_data['a_mean'].apply(type) == str].shape

(0, 28)

nat_data[nat_data['a_mean'].apply(type) == int].shape

(0, 28)

nat_data['a_mean'].isna().sum()

np.int64(7)

nat_data['a_mean'].describe()

count      1063.000000
mean      77213.170273
std       48508.978729
min       30830.000000
25%       48560.000000
50%       64220.000000
75%       89740.000000
max      450810.000000
Name: a_mean, dtype: float64

with pd.option_context('display.max_colwidth', None):
    print(nat_desc[nat_desc['Field'] == 'mean_prse']['Field Description'])

19    Percent relative standard error (PRSE) for the mean wage estimate. PRSE is a measure of sampling error, expressed as a percentage of the corresponding estimate. Sampling error occurs when values for a population are estimated from a sample survey of the population, rather than calculated from data for all members of the population. Estimates with lower PRSEs are typically more precise in the presence of sampling error.
Name: Field Description, dtype: object

nat_data_filtered['mean_prse'].head()

0    0.1
1    0.2
2    0.4
3    0.9
5    0.4
Name: mean_prse, dtype: float64

nat_data_filtered['mean_prse'].describe()

count    1070.000000
mean        1.224206
std         1.773071
min         0.000000
25%         0.400000
50%         0.700000
75%         1.300000
max        22.100000
Name: mean_prse, dtype: float64

nat_data_filtered['mean_prse'].isna().sum()

np.int64(0)

len(nat_data_filtered[nat_data_filtered['mean_prse'] == 0])

5

nat_data_filtered[nat_data_filtered['mean_prse'] == 0]

with pd.option_context('display.max_colwidth', None):
    print(nat_desc[nat_desc['Field'] == 'h_pct10']['Field Description'])

20    Hourly 10th percentile wage
Name: Field Description, dtype: object

nat_data_filtered[nat_data_filtered['h_pct10'].apply(type) == str].shape

(78, 16)

nat_data_filtered[nat_data_filtered['h_pct10'].apply(type) == str].head(2)

nat_data_filtered[nat_data_filtered['h_pct10'] == '*'].shape

(78, 16)

# Filter the primary dataset
nat_data_filtered = nat_data.drop(columns=an_hr_col_drop, axis=1)

# Capture a map of the types in the h_pct10 column
h_pct10_df = pd.DataFrame(nat_data_filtered['h_pct10'].map(type))

# Capture the index of all rows of h_pct10 that are of type str
h_pct10_str_idx = h_pct10_df[h_pct10_df['h_pct10'] == str].index

# Capture all str values in the h_pct10 col
nat_data_h_pct10_str = pd.DataFrame(nat_data_filtered['h_pct10'].loc[h_pct10_str_idx])

nat_data_h_pct10_str['h_pct10'].value_counts()

h_pct10
*    78
Name: count, dtype: int64

# Capture the index of all `*` cells
h_pct10_star_idx = nat_data_h_pct10_str[nat_data_h_pct10_str['h_pct10'] == '*'].index

print(len(h_pct10_star_idx))

78

# Set each of these cells to `NaN`
nat_data_filtered.loc[h_pct10_star_idx, 'h_pct10'] = np.nan

# Check that no type str values remain in the column
nat_data_filtered[nat_data_filtered['h_pct10'].apply(type) == str].shape

(0, 16)

# Check that each of the targeted cells is NaN 
nat_data_filtered['h_pct10'].loc[h_pct10_star_idx]

7       NaN
45      NaN
350     NaN
351     NaN
353     NaN
       ... 
1317    NaN
1318    NaN
1319    NaN
1320    NaN
1324    NaN
Name: h_pct10, Length: 78, dtype: object

nat_data_filtered['h_pct10'] = nat_data_filtered['h_pct10'].astype(float)

nat_data_filtered['h_pct10'].describe()

count    992.000000
mean      20.477671
std        8.139964
min        8.890000
25%       15.352500
50%       17.955000
75%       22.955000
max       91.210000
Name: h_pct10, dtype: float64

nat_data_filtered['h_pct10'].isna().sum()

np.int64(78)

nat_data['h_pct10'] = nat_data_filtered['h_pct10']

nat_data[nat_data['h_pct10'].apply(type) == str].shape

(0, 28)

nat_data[nat_data['h_pct10'].apply(type) == int].shape

(0, 28)

nat_data['h_pct10'].isna().sum()

np.int64(78)

nat_data['h_pct10'].describe()

count    992.000000
mean      20.477671
std        8.139964
min        8.890000
25%       15.352500
50%       17.955000
75%       22.955000
max       91.210000
Name: h_pct10, dtype: float64

with pd.option_context('display.max_colwidth', None):
    print(nat_desc[nat_desc['Field'] == 'h_pct25']['Field Description'])

21    Hourly 25th percentile wage
Name: Field Description, dtype: object

nat_data_filtered[nat_data_filtered['h_pct25'].apply(type) == str].shape

(79, 16)

nat_data_filtered[nat_data_filtered['h_pct25'].apply(type) == str].head(2)

nat_data_filtered[nat_data_filtered['h_pct25'] == '*'].shape

(78, 16)

nat_data_filtered[nat_data_filtered['h_pct25'] == '#'].shape

(1, 16)

with pd.option_context('display.max_colwidth', None):
    print(notes_df)

                                                                                         Notes
34                                        *  = indicates that a wage estimate is not available
35                                **  = indicates that an employment estimate is not available
36       #  = indicates a wage equal to or greater than $115.00 per hour or $239,200 per year 
37  ~ =indicates that the percent of establishments reporting the occupation is less than 0.5%

# Filter the primary dataset
nat_data_filtered = nat_data.drop(columns=an_hr_col_drop, axis=1)

# Capture a map of the types in the h_pct25 column
h_pct25_df = pd.DataFrame(nat_data_filtered['h_pct25'].map(type))

# Capture the index of all rows of h_pct25 that are of type str
h_pct25_str_idx = h_pct25_df[h_pct25_df['h_pct25'] == str].index

# Capture all str values in the h_pct25 col
nat_data_h_pct25_str = pd.DataFrame(nat_data_filtered['h_pct25'].loc[h_pct25_str_idx])

nat_data_h_pct25_str['h_pct25'].value_counts()

h_pct25
*    78
#     1
Name: count, dtype: int64

# Capture the index of all `*` cells
h_pct25_star_idx = nat_data_h_pct25_str[nat_data_h_pct25_str['h_pct25'] == '*'].index

print(len(h_pct25_star_idx))

78

# Set each of these cells to `NaN`
nat_data_filtered.loc[h_pct25_star_idx, 'h_pct25'] = np.nan

# Check that `78` type str values are removed from the column
nat_data_filtered[nat_data_filtered['h_pct25'].apply(type) == str].shape

(1, 16)

# Capture the index of all `#` cells
h_pct25_hash_idx = nat_data_h_pct25_str[nat_data_h_pct25_str['h_pct25'] == '#'].index

print(len(h_pct25_hash_idx))

1

# Set each of these cells to `115`
nat_data_filtered.loc[h_pct25_hash_idx, 'h_pct25'] = 115

# Check that each of the targeted cells is NaN 
nat_data_filtered['h_pct25'].loc[h_pct25_star_idx].unique()

array([nan], dtype=object)

# Check that each of the targeted cells is 115 
nat_data_filtered['h_pct25'].loc[h_pct25_hash_idx].unique()

array([115], dtype=object)

nat_data_filtered['h_pct25'] = nat_data_filtered['h_pct25'].astype(float)

nat_data_filtered['h_pct25'].describe()

count    992.000000
mean      25.801018
std       13.398383
min       11.770000
25%       17.932500
50%       22.125000
75%       28.825000
max      115.000000
Name: h_pct25, dtype: float64

nat_data_filtered['h_pct25'].isna().sum()

np.int64(78)

nat_data['h_pct25'] = nat_data_filtered['h_pct25']

nat_data[nat_data['h_pct25'].apply(type) == str].shape

(0, 28)

nat_data[nat_data['h_pct25'].apply(type) == int].shape

(0, 28)

nat_data['h_pct25'].isna().sum()

np.int64(78)

nat_data['h_pct25'].describe()

count    992.000000
mean      25.801018
std       13.398383
min       11.770000
25%       17.932500
50%       22.125000
75%       28.825000
max      115.000000
Name: h_pct25, dtype: float64

with pd.option_context('display.max_colwidth', None):
    print(nat_desc[nat_desc['Field'] == 'h_median']['Field Description'])

22    Hourly median wage (or the 50th percentile)
Name: Field Description, dtype: object

nat_data_filtered[nat_data_filtered['h_median'].apply(type) == str].shape

(97, 16)

nat_data_filtered[nat_data_filtered['h_median'].apply(type) == str].head(2)

nat_data_filtered[nat_data_filtered['h_median'] == '*'].shape

(78, 16)

nat_data_filtered[nat_data_filtered['h_median'] == '#'].shape

(19, 16)

with pd.option_context('display.max_colwidth', None):
    print(notes_df)

                                                                                         Notes
34                                        *  = indicates that a wage estimate is not available
35                                **  = indicates that an employment estimate is not available
36       #  = indicates a wage equal to or greater than $115.00 per hour or $239,200 per year 
37  ~ =indicates that the percent of establishments reporting the occupation is less than 0.5%

# Filter the primary dataset
nat_data_filtered = nat_data.drop(columns=an_hr_col_drop, axis=1)

# Capture a map of the types in the h_median column
h_median_df = pd.DataFrame(nat_data_filtered['h_median'].map(type))

# Capture the index of all rows of h_median that are of type str
h_median_str_idx = h_median_df[h_median_df['h_median'] == str].index

# Capture all str values in the h_median col
nat_data_h_median_str = pd.DataFrame(nat_data_filtered['h_median'].loc[h_median_str_idx])

nat_data_h_median_str['h_median'].value_counts()

h_median
*    78
#    19
Name: count, dtype: int64

# Capture the index of all `*` cells
h_median_star_idx = nat_data_h_median_str[nat_data_h_median_str['h_median'] == '*'].index

print(len(h_median_star_idx))

78

# Set each of these cells to `NaN`
nat_data_filtered.loc[h_median_star_idx, 'h_median'] = np.nan

# Check that `78` type str values are removed from the column
nat_data_filtered[nat_data_filtered['h_median'].apply(type) == str].shape

(19, 16)

# Capture the index of all `#` cells
h_median_hash_idx = nat_data_h_median_str[nat_data_h_median_str['h_median'] == '#'].index

print(len(h_median_hash_idx))

19

# Set each of these cells to `115`
nat_data_filtered.loc[h_median_hash_idx, 'h_median'] = 115

# Check that each of the targeted cells is NaN 
nat_data_filtered['h_median'].loc[h_median_star_idx].unique()

array([nan], dtype=object)

# Check that each of the targeted cells is 115 
nat_data_filtered['h_median'].loc[h_median_hash_idx].unique()

array([115], dtype=object)

nat_data_filtered['h_median'] = nat_data_filtered['h_median'].astype(float)

nat_data_filtered['h_median'].describe()

count    992.000000
mean      32.838407
std       18.393770
min       14.500000
25%       21.725000
50%       27.780000
75%       37.580000
max      115.000000
Name: h_median, dtype: float64

nat_data_filtered['h_median'].isna().sum()

np.int64(78)

nat_data['h_median'] = nat_data_filtered['h_median']

nat_data[nat_data['h_median'].apply(type) == str].shape

(0, 28)

nat_data[nat_data['h_median'].apply(type) == int].shape

(0, 28)

nat_data['h_median'].isna().sum()

np.int64(78)

nat_data['h_median'].describe()

count    992.000000
mean      32.838407
std       18.393770
min       14.500000
25%       21.725000
50%       27.780000
75%       37.580000
max      115.000000
Name: h_median, dtype: float64

nat_data[nat_data['h_median'] == 115].shape

(19, 28)

# Take a column and 
def wrangle_a_h_col(
    nat_data,
    col_drop,
    col_wrangle,
    hash_replacement
):
    # Set the filtered dataset to be a subset of the primary dataset, dropping unnecessary columns 
    temp_df = nat_data.drop(columns=col_drop, axis=1)

    # Capture a map of the types in the target column
    col_df = pd.DataFrame(temp_df[col_wrangle].map(type))

    # Capture the index of all rows of col_wrangle that are of type str
    col_str_idx = col_df[col_df[col_wrangle] == str].index

    # Capture all str values in col_wrangle
    nat_data_col_str = pd.DataFrame(temp_df[col_wrangle].loc[col_str_idx])

    # Debug:
    print('str val counts:\n')
    print(nat_data_col_str[col_wrangle].value_counts())

    # Capture the index of all `*` cells
    col_star_idx = nat_data_col_str[nat_data_col_str[col_wrangle] == '*'].index

    # Debug:
    print(len(h_median_star_idx))

    # Set each of these cells to `NaN`
    temp_df.loc[col_star_idx, col_wrangle] = np.nan

    # Debug:
    print('Resulting shape:\n')
    print(temp_df[temp_df[col_wrangle].apply(type) == str].shape)

    # Capture the index of all `#` cells
    col_hash_idx = nat_data_col_str[nat_data_col_str[col_wrangle] == '#'].index

    # Debug:
    print(len(col_hash_idx))

    # Set each of these cells to hash_replacement
    temp_df.loc[col_hash_idx, col_wrangle] = hash_replacement

    # Debug:
    print('Check: each of the targeted cells is NaN:\n') 
    print(temp_df[col_wrangle].loc[col_star_idx].unique())

    # Set to type float
    temp_df[col_wrangle] = temp_df[col_wrangle].astype(float)

    # Debug:
    print('Describe:\n')
    print(temp_df[col_wrangle].describe())

    # Debug:
    print('Check NaN:\n')
    print(temp_df[col_wrangle].isna().sum())

    return temp_df

nat_data_filtered = wrangle_a_h_col(nat_data, an_hr_col_drop, 'h_pct75', 115)

str val counts:

h_pct75
*    78
#    26
Name: count, dtype: int64
78
Resulting shape:

(26, 16)
26
Check: each of the targeted cells is NaN:

[nan]
Describe:

count    992.000000
mean      41.339748
std       21.625436
min       17.000000
25%       26.125000
50%       35.965000
75%       48.940000
max      115.000000
Name: h_pct75, dtype: float64
Check NaN:

78

nat_data_filtered['h_pct75'].describe()

count    992.000000
mean      41.339748
std       21.625436
min       17.000000
25%       26.125000
50%       35.965000
75%       48.940000
max      115.000000
Name: h_pct75, dtype: float64

nat_data['h_pct75'] = nat_data_filtered['h_pct75']

nat_data_filtered = wrangle_a_h_col(nat_data, an_hr_col_drop, 'h_pct90', 115)

str val counts:

h_pct90
*    78
#    53
Name: count, dtype: int64
78
Resulting shape:

(53, 16)
53
Check: each of the targeted cells is NaN:

[nan]
Describe:

count    992.000000
mean      51.208468
std       25.278038
min       17.300000
25%       31.507500
50%       44.585000
75%       63.637500
max      115.000000
Name: h_pct90, dtype: float64
Check NaN:

78

nat_data_filtered['h_pct90'].describe()

count    992.000000
mean      51.208468
std       25.278038
min       17.300000
25%       31.507500
50%       44.585000
75%       63.637500
max      115.000000
Name: h_pct90, dtype: float64

nat_data['h_pct90'] = nat_data_filtered['h_pct90']

with pd.option_context('display.max_colwidth', None):
    print(notes_df)

                                                                                         Notes
34                                        *  = indicates that a wage estimate is not available
35                                **  = indicates that an employment estimate is not available
36       #  = indicates a wage equal to or greater than $115.00 per hour or $239,200 per year 
37  ~ =indicates that the percent of establishments reporting the occupation is less than 0.5%

nat_data_filtered = wrangle_a_h_col(nat_data, an_hr_col_drop, 'a_pct10', 239200)

str val counts:

a_pct10
*    7
Name: count, dtype: int64
78
Resulting shape:

(0, 16)
0
Check: each of the targeted cells is NaN:

[nan]
Describe:

count      1063.000000
mean      43009.416745
std       16659.119891
min       18500.000000
25%       32265.000000
50%       38030.000000
75%       48300.000000
max      189720.000000
Name: a_pct10, dtype: float64
Check NaN:

7

nat_data_filtered['a_pct10'].describe()

count      1063.000000
mean      43009.416745
std       16659.119891
min       18500.000000
25%       32265.000000
50%       38030.000000
75%       48300.000000
max      189720.000000
Name: a_pct10, dtype: float64

nat_data['a_pct10'] = nat_data_filtered['a_pct10']

nat_data_filtered = wrangle_a_h_col(nat_data, an_hr_col_drop, 'a_pct25', 239200)

str val counts:

a_pct25
*    7
#    1
Name: count, dtype: int64
78
Resulting shape:

(1, 16)
1
Check: each of the targeted cells is NaN:

[nan]
Describe:

count      1063.000000
mean      54261.674506
std       27360.720901
min       24480.000000
25%       37535.000000
50%       46950.000000
75%       62075.000000
max      239200.000000
Name: a_pct25, dtype: float64
Check NaN:

7

nat_data_filtered['a_pct25'].describe()

count      1063.000000
mean      54261.674506
std       27360.720901
min       24480.000000
25%       37535.000000
50%       46950.000000
75%       62075.000000
max      239200.000000
Name: a_pct25, dtype: float64

nat_data['a_pct25'] = nat_data_filtered['a_pct25']

nat_data_filtered = wrangle_a_h_col(nat_data, an_hr_col_drop, 'a_median', 239200)

str val counts:

a_median
#    19
*     7
Name: count, dtype: int64
78
Resulting shape:

(19, 16)
19
Check: each of the targeted cells is NaN:

[nan]
Describe:

count      1063.000000
mean      69326.698024
std       37891.227934
min       30160.000000
25%       45850.000000
50%       59280.000000
75%       80125.000000
max      239200.000000
Name: a_median, dtype: float64
Check NaN:

7

nat_data_filtered['a_median'].describe()

count      1063.000000
mean      69326.698024
std       37891.227934
min       30160.000000
25%       45850.000000
50%       59280.000000
75%       80125.000000
max      239200.000000
Name: a_median, dtype: float64

nat_data['a_median'] = nat_data_filtered['a_median']

nat_data_filtered = wrangle_a_h_col(nat_data, an_hr_col_drop, 'a_pct75', 239200)

str val counts:

a_pct75
#    28
*     7
Name: count, dtype: int64
78
Resulting shape:

(28, 16)
28
Check: each of the targeted cells is NaN:

[nan]
Describe:

count      1063.000000
mean      87751.072437
std       45028.967280
min       35360.000000
25%       56150.000000
50%       76820.000000
75%      104390.000000
max      239200.000000
Name: a_pct75, dtype: float64
Check NaN:

7

nat_data_filtered['a_pct75'].describe()

count      1063.000000
mean      87751.072437
std       45028.967280
min       35360.000000
25%       56150.000000
50%       76820.000000
75%      104390.000000
max      239200.000000
Name: a_pct75, dtype: float64

nat_data['a_pct75'] = nat_data_filtered['a_pct75']

nat_data_filtered = wrangle_a_h_col(nat_data, an_hr_col_drop, 'a_pct90', 239200)

str val counts:

a_pct90
#    60
*     7
Name: count, dtype: int64
78
Resulting shape:

(60, 16)
60
Check: each of the targeted cells is NaN:

[nan]
Describe:

count      1063.000000
mean     109360.479774
std       53278.632026
min       35970.000000
25%       67490.000000
50%       95750.000000
75%      137800.000000
max      239200.000000
Name: a_pct90, dtype: float64
Check NaN:

7

nat_data_filtered['a_pct90'].describe()

count      1063.000000
mean     109360.479774
std       53278.632026
min       35970.000000
25%       67490.000000
50%       95750.000000
75%      137800.000000
max      239200.000000
Name: a_pct90, dtype: float64

nat_data['a_pct90'] = nat_data_filtered['a_pct90']

nat_data_filtered = nat_data.drop(columns=an_hr_col_drop, axis=1)

nat_data_filtered['annual'].unique()

array([nan, True], dtype=object)

nat_data_filtered['annual'].value_counts()

annual
True    78
Name: count, dtype: int64

nat_data_filtered['annual'].isna().sum()

np.int64(992)

with pd.option_context('display.max_colwidth', None):
    print(nat_desc[nat_desc['Field'] == 'annual']['Field Description'])

30    Contains "TRUE" if only annual wages are released. The OEWS program releases only annual wages for some occupations that typically work fewer than 2,080 hours per year, but are paid on an annual basis, such as teachers, pilots, and athletes.
Name: Field Description, dtype: object

nat_data_filtered = nat_data.drop(columns=an_hr_col_drop, axis=1)

nat_data_filtered['hourly'].unique()

array([nan, True], dtype=object)

nat_data_filtered['hourly'].value_counts()

hourly
True    7
Name: count, dtype: int64

nat_data_filtered['hourly'].isna().sum()

np.int64(1063)

with pd.option_context('display.max_colwidth', None):
    print(nat_desc[nat_desc['Field'] == 'hourly']['Field Description'])

31    Contains "TRUE" if only hourly wages are released. The OEWS program releases only hourly wages for some occupations that typically work fewer than 2,080 hours per year and are paid on an hourly basis, such as actors, dancers, and musicians and singers.
Name: Field Description, dtype: object

bls_schema_name = 'bls'
nat_data_table_name = 'nat_data'

# Check if the bls schema exists. If not, create it.
# Adapted from a Google search, 'sqlalchemy how to create a schema'
with engine.connect() as conn:
    if not inspect(conn).has_schema(bls_schema_name):
        conn.execute(CreateSchema(bls_schema_name))
        conn.commit()
    else:
        print(f'{bls_schema_name} already exists')

bls already exists

# Check if a table with the indicated name exists
# If it does, check that it has a matching column and data type structure
# If it does not, send error and pause script
# As needed, create table in its entirety
# Adapted from the response of a google search, 'slqalchemy how to create a schema'
# Adapted from google search, 'sqlalchemy check if table exists within a schema'
# Adapted from Google search, 'sqlalchemy how to create column with big serial primary key'
    
# Create an inspector
inspector = inspect(engine)

# Check for the table in a specific schema
# If schema is None, it checks the default schema (e.g., 'public' in Postgres)
table_exists = inspector.has_table(nat_data_table_name, schema=bls_schema_name)

print(f"{nat_data_table_name} table exists: {table_exists}")

if not table_exists:
    class Base(DeclarativeBase):
        pass

    class User(Base):
        __tablename__ = nat_data_table_name
        # To place this table in a specific schema:
        __table_args__ = {"schema": schema_name} 
        
        id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
        area = Column(Integer)
        area_title = Column(String)
        area_type = Column(Integer)
        prim_state = Column(String)
        naics = Column(Integer)
        naics_title = Column(String)
        i_group = Column(String)
        own_code = Column(Integer)
        occ_code = Column(String)
        occ_title = Column(String)
        o_group = Column(String)
        tot_emp = Column(Integer)
        emp_prse = Column(Float)
        h_mean = Column(Float)
        a_mean = Column(Float)
        mean_prse = Column(Float)
        h_pct10 = Column(Float)
        h_pct25 = Column(Float)
        h_median = Column(Float)
        h_pct75 = Column(Float)
        h_pct90 = Column(Float)
        a_pct10 = Column(Float)
        a_pct25 = Column(Float)
        a_median = Column(Float)
        a_pct75 = Column(Float)
        a_pct90 = Column(Float)
        annual = Column(Boolean)
        hourly = Column(Boolean)
        
    
    # Create all tables defined in 'Base'
    Base.metadata.create_all(engine)

nat_data table exists: True

# Only run once
# If a need arises to reimport, drop the database using the PgAdmin4 interface and then reimport

# Insert each row from nat_data into the psql table
# nat_data.to_sql(
#    name=nat_data_table_name, 
#    con=engine, 
#    schema=bls_schema_name,
#    if_exists='append',      # options: 'fail', 'replace', 'append'
#    index=False
#)

with engine.connect() as conn:
    result = conn.execute(text("select count(*) from bls.nat_data"))
    print(result.fetchone())

(1070,)

	AREA	AREA_TITLE	AREA_TYPE	PRIM_STATE	NAICS_TITLE	I_GROUP	OWN_CODE	OCC_CODE	OCC_TITLE	...	H_MEDIAN	H_PCT75	H_PCT90	A_PCT10	A_PCT25	A_MEDIAN	A_PCT75	A_PCT90	ANNUAL	HOURLY
0	1	Alabama	2	AL	Cross-industry	cross-industry	1235	00-0000	All Occupations	...	21.07	30.82	47.51	23520	30660	43830	64110	98810	NaN	NaN
1	1	Alabama	2	AL	Cross-industry	cross-industry	1235	11-0000	Management Occupations	...	48.39	68.5	98.03	51100	72870	100640	142480	203900	NaN	NaN
2	1	Alabama	2	AL	Cross-industry	cross-industry	1235	11-1011	Chief Executives	...	79.04	106.69	#	104950	130950	164400	221910	#	NaN	NaN
3	1	Alabama	2	AL	Cross-industry	cross-industry	1235	11-1021	General and Operations Managers	...	51.12	78.26	#	50410	74720	106330	162780	#	NaN	NaN
4	1	Alabama	2	AL	Cross-industry	cross-industry	1235	11-1031	Legislators	...	*	*	*	18270	20950	26990	41760	63900	True	NaN

	h_mean	a_mean	mean_prse	h_pct10	h_pct25	h_median	h_pct75	h_pct90	a_pct10	a_pct25	a_median	a_pct75	a_pct90	annual	hourly
0	32.66	67920	0.1	14.42	17.66	23.8	37.89	60.44	29990	36730	49500	78810	125720	NaN	NaN
1	68.15	141760	0.2	27.41	38.42	58.7	82.5	#	57010	79900	122090	171610	#	NaN	NaN
2	67.24	139860	0.4	22.84	33.08	50.48	81.01	#	47510	68800	104990	168490	#	NaN	NaN
3	126.41	262930	0.9	35.44	60.61	99.24	#	#	73710	126080	206420	#	#	NaN	NaN
5	64	133120	0.4	22.8	32.29	49.5	78.91	#	47420	67160	102950	164130	#	NaN	NaN
7	*	67390	1.6	*	*	*	*	*	20380	29120	44810	80350	137820	True	NaN
9	78.72	163740	0.3	35.25	48.43	69.48	98.52	#	73320	100740	144530	204920	#	NaN	NaN
10	71.76	149270	2.3	30.29	41.34	61.04	85.85	#	63000	85990	126960	178570	#	NaN	NaN
12	79.35	165050	0.3	35.13	48.65	71.57	99.24	#	73080	101200	148870	206410	#	NaN	NaN
13	82.46	171520	0.6	39.38	53.47	77.42	101.48	#	81900	111210	161030	211080	#	NaN	NaN

	occ_title	h_mean	a_mean	mean_prse	h_pct10	h_pct25	h_median	h_pct75	h_pct90	a_pct10	a_pct25	a_median	a_pct75	a_pct90	annual	hourly
5	General and Operations Managers	64	133120	0.4	22.8	32.29	49.5	78.91	#	47420	67160	102950	164130	#	NaN	NaN
189	Electrical and Electronics Engineers	60	124810	0.6	36.63	44.42	57.11	72.37	87.21	76190	92390	118780	150530	181390	NaN	NaN
346	Miscellaneous Legal Support Workers	35	72800	1.0	18.35	22.57	29.11	39.13	62.75	38170	46950	60550	81390	130510	NaN	NaN
791	Sales and Related Occupations	26	54070	0.3	13.01	14.91	18.01	28.79	47.53	27060	31010	37460	59880	98860	NaN	NaN
920	Legal Secretaries and Administrative Assistants	29	60320	2.6	17.08	20.54	26.03	34.66	42.15	35530	42720	54140	72090	87660	NaN	NaN
924	Data Entry and Information Processing Workers	21	43680	0.5	15	17.19	19.94	23.7	28.47	31200	35760	41480	49300	59220	NaN	NaN
950	Animal Breeders	27	56150	5.4	17.85	20.82	25	28.67	43.26	37130	43310	52000	59630	89970	NaN	NaN
1121	Millwrights	33	68640	1.0	21.68	26.65	31.33	38.74	45.07	45100	55420	65170	80580	93740	NaN	NaN
1266	Plant and System Operators, All Other	31	64490	1.6	19.35	23.01	29.67	36.98	44.59	40250	47860	61710	76910	92750	NaN	NaN
1385	Industrial Truck and Tractor Operators	23	47830	0.2	17.55	19.13	22.3	25.81	29.59	36500	39780	46390	53680	61540	NaN	NaN

	occ_title	h_mean	a_mean	h_pct10	h_pct25	h_median	h_pct75	h_pct90	a_pct10	a_pct25	a_median	a_pct75	a_pct90	annual	hourly
61	Postmasters and Mail Superintendents	45.08	93760.0	39.15	41.9	44.58	47.88	52.47	81430	87150	92730	99590	109140	NaN	NaN
907	Postal Service Workers	28.71	59720.0	20.38	22.13	27.82	35.6	36.2	42390	46030	57870	74050	75300	NaN	NaN
908	Postal Service Clerks	29.36	61070.0	20.48	26.64	29.63	35.6	35.6	42600	55410	61630	74050	74050	NaN	NaN
909	Postal Service Mail Carriers	28.79	59880.0	20.38	22.13	27.64	36.2	36.96	42390	46030	57490	75300	76880	NaN	NaN
910	Postal Service Mail Sorters, Processors, and P...	28.04	58310.0	20.48	22.78	27.18	35.08	35.6	42600	47380	56530	72970	74050	NaN	NaN

	AREA	AREA_TITLE	AREA_TYPE	PRIM_STATE	NAICS_TITLE	I_GROUP	OWN_CODE	OCC_CODE	OCC_TITLE	...	H_MEDIAN	H_PCT75	H_PCT90	A_PCT10	A_PCT25	A_MEDIAN	A_PCT75	A_PCT90	ANNUAL	HOURLY
0	99	U.S.	1	US	Cross-industry	cross-industry	1235	00-0000	All Occupations	...	23.8	37.89	60.44	29990	36730	49500	78810	125720	NaN	NaN
1	99	U.S.	1	US	Cross-industry	cross-industry	1235	11-0000	Management Occupations	...	58.7	82.5	#	57010	79900	122090	171610	#	NaN	NaN
2	99	U.S.	1	US	Cross-industry	cross-industry	1235	11-1000	Top Executives	...	50.48	81.01	#	47510	68800	104990	168490	#	NaN	NaN
3	99	U.S.	1	US	Cross-industry	cross-industry	1235	11-1010	Chief Executives	...	99.24	#	#	73710	126080	206420	#	#	NaN	NaN
4	99	U.S.	1	US	Cross-industry	cross-industry	1235	11-1011	Chief Executives	...	99.24	#	#	73710	126080	206420	#	#	NaN	NaN

	May 2024 OEWS Estimates	Unnamed: 1	Unnamed: 2
0	NaN	NaN	NaN
1	Occupational Employment and Wage Statistics (O...	NaN	NaN
2	Bureau of Labor Statistics, Department of Labor	NaN	NaN
3	website: www.bls.gov/oes	NaN	NaN
4	email: oewsinfo@bls.gov	NaN	NaN
5	NaN	NaN	NaN
6	Not all fields are available for every type of...	NaN	NaN
7	NaN	NaN	NaN
8	Field	Field Description	NaN
9	area	U.S. (99), state FIPS code, Metropolitan Stati...	NaN
10	area_title	Area name	NaN
11	area_type	Area type: 1= U.S.; 2= State; 3= U.S. Territor...	NaN
12	prim_state	The primary state for the given area. "US" is ...	NaN
13	naics	North American Industry Classification System ...	NaN
14	naics_title	North American Industry Classification System ...	NaN

	Field	Field Description
28	a_pct75	Annual 75th percentile wage
29	a_pct90	Annual 90th percentile wage
30	annual	Contains "TRUE" if only annual wages are relea...
31	hourly	Contains "TRUE" if only hourly wages are relea...
32	NaN	NaN
33	Notes:	NaN
34	* = indicates that a wage estimate is not ava...	NaN
35	** = indicates that an employment estimate is...	NaN
36	# = indicates a wage equal to or greater than...	NaN
37	~ =indicates that the percent of establishment...	NaN

	area	area_title	area_type	prim_state	naics_title	i_group	own_code	occ_code	occ_title	...	h_median	h_pct75	h_pct90	a_pct10	a_pct25	a_median	a_pct75	a_pct90	annual	hourly
0	10180	Abilene, TX	4	TX	Cross-industry	cross-industry	1235	00-0000	All Occupations	...	20.25	29.34	45.3	23120	29640	42120	61020	94220	NaN	NaN
1	10180	Abilene, TX	4	TX	Cross-industry	cross-industry	1235	11-0000	Management Occupations	...	42.03	61.17	84.3	44710	61250	87420	127230	175350	NaN	NaN
2	10180	Abilene, TX	4	TX	Cross-industry	cross-industry	1235	11-1011	Chief Executives	...	85.35	#	#	99140	107040	177520	#	#	NaN	NaN
3	10180	Abilene, TX	4	TX	Cross-industry	cross-industry	1235	11-1021	General and Operations Managers	...	37.18	58.64	87.22	35980	50640	77340	121970	181420	NaN	NaN
4	10180	Abilene, TX	4	TX	Cross-industry	cross-industry	1235	11-2021	Marketing Managers	...	50.02	73.86	99.9	58480	82560	104030	153630	207790	NaN	NaN

	area	area_title	area_type	prim_state	naics_title	i_group	own_code	occ_code	occ_title	...	h_median	h_pct75	h_pct90	a_pct10	a_pct25	a_median	a_pct75	a_pct90	annual	hourly
0	100001	Northwest Alabama nonmetropolitan area	6	AL	Cross-industry	cross-industry	1235	00-0000	All Occupations	...	18.94	25	35.51	22380	29690	39400	52000	73870	NaN	NaN
1	100001	Northwest Alabama nonmetropolitan area	6	AL	Cross-industry	cross-industry	1235	11-0000	Management Occupations	...	40.54	58.5	80.72	44770	60730	84320	121680	167910	NaN	NaN
2	100001	Northwest Alabama nonmetropolitan area	6	AL	Cross-industry	cross-industry	1235	11-1011	Chief Executives	...	71.58	85.16	#	114610	121930	148890	177130	#	NaN	NaN
3	100001	Northwest Alabama nonmetropolitan area	6	AL	Cross-industry	cross-industry	1235	11-1021	General and Operations Managers	...	46.63	73.99	101.78	42680	63380	96980	153900	211700	NaN	NaN
4	100001	Northwest Alabama nonmetropolitan area	6	AL	Cross-industry	cross-industry	1235	11-1031	Legislators	...	*	*	*	18210	22690	51460	52020	52020	True	NaN

	area	area_title	area_type	prim_state	naics	naics_title	i_group	own_code	occ_code	occ_title	...	h_median	h_pct75	h_pct90	a_pct10	a_pct25	a_median	a_pct75	a_pct90	annual	hourly
0	99	U.S.	1	US	11	Agriculture, Forestry, Fishing and Hunting	sector	5	00-0000	All Occupations	...	17.66	22.21	30.23	33280	34680	36720	46200	62880	NaN	NaN
1	99	U.S.	1	US	11	Agriculture, Forestry, Fishing and Hunting	sector	5	11-0000	Management Occupations	...	46.08	63.4	90.74	41710	67090	95840	131870	188730	NaN	NaN
2	99	U.S.	1	US	11	Agriculture, Forestry, Fishing and Hunting	sector	5	11-1000	Top Executives	...	40.91	61.6	91.98	32100	57390	85080	128140	191320	NaN	NaN
3	99	U.S.	1	US	11	Agriculture, Forestry, Fishing and Hunting	sector	5	11-1010	Chief Executives	...	103.99	#	#	105020	154280	216290	#	#	NaN	NaN
4	99	U.S.	1	US	11	Agriculture, Forestry, Fishing and Hunting	sector	5	11-1011	Chief Executives	...	103.99	#	#	105020	154280	216290	#	#	NaN	NaN

	occ_code	occ_title	o_group	tot_emp	emp_prse	h_mean	a_mean
835	43-1000	Supervisors of Office and Administrative Suppo...	minor	1495580	0.4	34.4	71560
836	43-1010	First-Line Supervisors of Office and Administr...	broad	1495580	0.4	34.4	71560
793	41-1010	First-Line Supervisors of Sales Workers	broad	1332180	0.4	28.77	59830
792	41-1000	Supervisors of Sales Workers	minor	1332180	0.4	28.77	59830
815	41-3091	Sales Representatives of Services, Except Adve...	detailed	1189330	0.7	39.07	81260
...	...	...	...	...	...	...	...
1005	47-2142	Paperhangers	detailed	1520	13.7	24.95	51900
844	43-2099	Communications Equipment Operators, All Other	detailed	1390	9.9	26.87	55890
843	43-2090	Miscellaneous Communications Equipment Operators	broad	1390	9.9	26.87	55890
566	29-1243	Pediatric Surgeons	detailed	1050	23.4	216.74	450810
269	19-3032	Industrial-Organizational Psychologists	detailed	1050	24.7	64.62	134400

	occ_code
11	53-50
12	51-90
0	51-80
5	51-30
4	51-20
3	47-21
6	47-20
16	41-90
2	39-40
10	35-20
1	33-90
13	29-90
7	25-20
14	21-10
9	17-20
15	17-10
17	11-90
8	11-30

	occ_code	occ_title	o_group	tot_emp	emp_prse	h_mean	a_mean
21	11-3013	Facilities Managers	detailed	141090	0.7	55.06	114520
47	11-9039	Education Administrators, All Other	detailed	53330	1.9	47.82	99460
173	17-1020	Surveyors, Cartographers, and Photogrammetrists	broad	65870	2.4	37.46	77920
183	17-2040	Chemical Engineers	broad	20330	3.7	61.75	128430
312	21-1013	Marriage and Family Therapists	detailed	65870	4.0	34.96	72720
406	25-2023	Career/Technical Education Teachers, Middle Sc...	detailed	14200	3.6	*	68690
609	29-9099	Healthcare Practitioners and Technical Workers...	detailed	36970	1.9	35.19	73200
672	33-9031	Gambling Surveillance Officers and Gambling In...	detailed	10000	2.1	22.46	46710
692	35-2019	Cooks, All Other	detailed	23590	7.4	18.27	38000
760	39-4012	Crematory Operators	detailed	2950	5.7	21.53	44790
822	41-9011	Demonstrators and Product Promoters	detailed	64770	5.4	21.03	43730
995	47-2082	Tapers	detailed	12500	7.6	33.84	70390
1005	47-2142	Paperhangers	detailed	1520	13.7	24.95	51900
1153	51-2021	Coil Winders, Tapers, and Finishers	detailed	12170	4.0	23.53	48940
1170	51-3022	Meat, Poultry, and Fish Cutters and Trimmers	detailed	141090	1.2	18.58	38640
1255	51-8011	Nuclear Power Reactor Operators	detailed	5720	1.0	59.05	122830
1272	51-9021	Crushing, Grinding, and Polishing Machine Sett...	detailed	28550	3.3	23.75	49400
1352	53-5000	Water Transportation Workers	minor	77710	2.4	38.54	80150

	emp_prse_count
129	19.0
128	22.4
127	12.5
126	38.4
125	13.0
...	...
4	0.4
3	1.2
2	0.3
1	0.2
0	0.0

	occ_title	h_mean	a_mean	mean_prse	h_pct10	h_pct25	h_median	h_pct75	h_pct90	a_pct10	a_pct25	a_median	a_pct75	a_pct90	annual	hourly
7	Legislators	NaN	67390.0	1.6	*	*	*	*	*	20380	29120	44810	80350	137820	True	NaN
45	Education Administrators, Kindergarten through...	NaN	113360.0	0.4	*	*	*	*	*	72400	83840	104070	132550	165820	True	NaN

About This notebook¶

Link to Data¶

Project Setup¶

Import and Verifications¶

Create SQL Engine and Test Connection¶

Import BLS Files¶

Test Data in Each Imported File¶

Rough Wrangling¶

National Data¶

Rough Wrangle National BLS Data¶

Rough Wrangle National Field Descriptions¶

Split Descriptions and Notes¶

Storing Notes in DataFrame¶

State Data¶

Rough Wrangle State Data¶

Rough Wrangle State Field Descriptions¶

Metropolitan Data¶

Rough Wrangle Metropolitan Data¶

Rough Wrangle Metropolitan Field Descriptions¶

Balance of State Data¶

Rough Wrangle Balance of State Data¶

Rough Wrangle Balance of State Field Descriptions¶

National Sector Data¶

Rough Wrangle National Sector Data¶

Rough Wrangle National Sector Field Descriptions¶

Rough Wrangle Closing Commentary¶

Full Wrangle¶

National Data¶

Basic Inspection¶

Removing Blank Columns¶

Confirm Necessity of Deletion¶

Add To Do List¶

Data¶

Descriptions¶

Remote Columns from National Data¶

Remote Rows from National Descriptions¶

Cross Off To Do List¶

Data¶

Descriptions¶

National Data Area-Related Columns Wrangling¶

Industry Classifications¶

occ_code: Standard Occupation Classifications¶

Add To Do Item¶

Clean Duplicate Rows¶

Mark To Do Item Complete¶

occ_title: SOC Column Cleaning Continuation¶

Add To Do Item¶

Clean Duplicate Rows¶

Part 2: Analyzing for Additional Duplicates via occ_title¶

Part 3: Dropping Additional Duplicates¶

Remove To Do Item¶

o_group: SOC Column Cleaning Continued¶

'tot_emp' Wrangle¶

Observations¶

Add To Do Item¶

Isolating by occ_code¶

Compare by occ_code Starting Digits¶

Observe tot_emp Duplicates with Only One occ_code Instance Each¶

Analysis¶

Update To Do Items¶

Compare Subgroups to Each Other¶

Compare Against emp_prse Instead¶

Update To Do Items¶

Create a Custom Function¶

Call the Custom Function¶

Verification¶

Prepare for Drop¶

Remove Non-Duplicates From Filtered Dataframe¶

Remove Rows in Filtered Dataframe from Main National Dataframe¶

Remove To Do Items¶

emp_prse Wrangle¶

Update To Do Items¶

Observations on emp_prse¶

Update To Do Items¶

mean_prse Wrangle¶

Update To Do Items¶

Update To Do Items¶

Hourly and Annual Data Analysis¶

Initial Analysis and Grouping of Columns¶

Commentary¶

`occ_code`: Standard Occupation Classifications¶

`occ_title`: SOC Column Cleaning Continuation¶

Part 2: Analyzing for Additional Duplicates via `occ_title`¶

Isolating by `occ_code`¶

Compare by `occ_code` Starting Digits¶

Observe `tot_emp` Duplicates with Only One `occ_code` Instance Each¶

Compare Against `emp_prse` Instead¶

`emp_prse` Wrangle¶

Observations on `emp_prse`¶

`mean_prse` Wrangle¶

Analysis for `int` Values¶

Analysis for `str` Values¶

Comparing `h_mean` and `a_mean`¶

Analyzing the `hourly` and `annual` Columns¶

`h_mean`¶

Update `nat_data`¶

Converting `*` to `str`¶

Update `nat_data`¶

`h_pct10`¶

Converting `*` to `str`¶

Update `nat_data`¶

`h_pct25`¶

Converting `*` to `str`¶

Setting `*` to `NaN`¶

Update `nat_data`¶

`h_median`¶

Converting `*` to `str`¶

Setting `*` to `NaN`¶

Update `nat_data`¶

`h_pct75`¶

`h_pct90`¶

`a_pct10`¶

`a_pct25`¶

`a_median`¶

`a_pct75`¶

`a_pct90`¶

`annual`¶

`hourly`¶