About This Notebook¶
This notebook imports O*NET career-training datasets, wrangles them, and then reinserts them into the parent PostgreSQL database.
The idea is that this notebook is run once, transferring the data to the PostgreSQL database in as raw and pristine a fashion as possible.
If the data in the database is later found to be unsatisfactory, the associated tables should be dropped, this file adapted as necessary, and the notebook run again.
Project Setup¶
Import and Verifications¶
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sqlalchemy
print(sqlalchemy.__version__)
2.0.48
from sqlalchemy import create_engine, inspect
from sqlalchemy import text
from sqlalchemy.schema import CreateSchema
from sqlalchemy import BigInteger, Column, Integer, String, Float, Boolean
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
Create SQL Engine and Test Connection¶
engine = create_engine('postgresql+psycopg2://postgres@localhost/train_reward_compare')
with engine.connect() as conn:
result = conn.execute(text("select count(*) from onet.occupation_data"))
print(result.fetchone())
(1016,)
Import Data from PostgreSQL¶
List of Imports¶
Many of the O*NET dataset tables could be useful. There are too many tables -- and many are too large -- for an exhaustive analysis, so we will limit our datasets to only those necessary to reach our end goal.
The following datasets seem immediately relevant.
- education_training_experience
- this dataset contains information about each occupational type's required education, training, and experience, which is what we desire to analyze
- ete_categories
- this dataset provides decoding information about the categories in education, training, and experience
- occupation_data
- this dataset provides decoding information about the occupations
- content_model_reference
- this dataset provides information about the types of content skills that are needed for a particular occupation
- scales_reference
- this dataset provides decoding information about scales
Each of the above datasets will need to be wrangled.
Once wrangled, we will create join statements that link them together on the occupation codes, and we will likewise join them to the wage information in the BLS dataset.
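The planned joins can be sketched with pandas merges. The miniature frames below are hypothetical stand-ins for the imported tables; only the key columns mirror the real schema.

```python
import pandas as pd

# Hypothetical miniature stand-ins for the real O*NET tables
ete = pd.DataFrame({'onetsoc_code': ['11-1011.00'], 'element_id': ['2.D.1'],
                    'scale_id': ['RL'], 'data_value': [4.46]})
od = pd.DataFrame({'onetsoc_code': ['11-1011.00'], 'title': ['Chief Executives']})
cmr = pd.DataFrame({'element_id': ['2.D.1'], 'element_name': ['Required Level of Education']})
sr = pd.DataFrame({'scale_id': ['RL'], 'scale_name': ['Required Level of Education']})

# Chain left merges so unmatched codes surface as NaN instead of disappearing
decoded = (ete.merge(od, on='onetsoc_code', how='left')
              .merge(cmr, on='element_id', how='left')
              .merge(sr, on='scale_id', how='left'))
print(decoded[['onetsoc_code', 'title', 'element_name', 'scale_name']].to_string(index=False))
```

Left merges are chosen here so that any ete rows lacking a decoder entry are easy to spot rather than silently dropped.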
Import All Datasets¶
# Import each dataset via SQLAlchemy into a pandas DataFrame
# Adapted from Google search, 'pandas how to read in a table sqlalchemy postgresql'
# read_sql_table opens and closes its own connection from the engine, so the
# empty-DataFrame initializations and unused connect() wrappers are unnecessary
ete = pd.read_sql_table('education_training_experience', schema='onet', con=engine)
ete_categories = pd.read_sql_table('ete_categories', schema='onet', con=engine)
od = pd.read_sql_table('occupation_data', schema='onet', con=engine)
cmr = pd.read_sql_table('content_model_reference', schema='onet', con=engine)
sr = pd.read_sql_table('scales_reference', schema='onet', con=engine)
ete: Education, Training, Experience¶
The ete dataset is our primary target.
We begin by observing each column, looking for NaN values, duplicates, etc.
ete.head()
| | onetsoc_code | element_id | scale_id | category | data_value | n | standard_error | lower_ci_bound | upper_ci_bound | recommend_suppress | date_updated | domain_source |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11-1011.00 | 2.D.1 | RL | 1.0 | 0.00 | 28.0 | 0.0000 | NaN | NaN | N | 2023-08-01 | Incumbent |
| 1 | 11-1011.00 | 2.D.1 | RL | 2.0 | 4.46 | 28.0 | 4.1428 | 0.6307 | 25.5524 | N | 2023-08-01 | Incumbent |
| 2 | 11-1011.00 | 2.D.1 | RL | 3.0 | 0.00 | 28.0 | 0.0000 | NaN | NaN | N | 2023-08-01 | Incumbent |
| 3 | 11-1011.00 | 2.D.1 | RL | 4.0 | 0.00 | 28.0 | 0.0000 | NaN | NaN | N | 2023-08-01 | Incumbent |
| 4 | 11-1011.00 | 2.D.1 | RL | 5.0 | 5.15 | 28.0 | 5.2236 | 0.6000 | 32.7756 | N | 2023-08-01 | Incumbent |
ete.duplicated().sum()
np.int64(0)
There appear to be no straight duplicates.
ete.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 37125 entries, 0 to 37124 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 onetsoc_code 37125 non-null object 1 element_id 37125 non-null object 2 scale_id 37125 non-null object 3 category 35998 non-null float64 4 data_value 37125 non-null float64 5 n 37125 non-null float64 6 standard_error 28115 non-null float64 7 lower_ci_bound 17736 non-null float64 8 upper_ci_bound 17736 non-null float64 9 recommend_suppress 28115 non-null object 10 date_updated 37125 non-null datetime64[ns] 11 domain_source 37125 non-null object dtypes: datetime64[ns](1), float64(6), object(5) memory usage: 3.4+ MB
ete.shape
(37125, 12)
onetsoc_code¶
ete['onetsoc_code'].describe()
count 37125 unique 878 top 11-1011.00 freq 43 Name: onetsoc_code, dtype: object
From the website:
onetsoc_code Character(10) O*NET-SOC Code (see Occupation Data)
Joinable to the occupation_data table.
ete['onetsoc_code']
0 11-1011.00
1 11-1011.00
2 11-1011.00
3 11-1011.00
4 11-1011.00
...
37120 53-7121.00
37121 53-7121.00
37122 53-7121.00
37123 53-7121.00
37124 53-7121.00
Name: onetsoc_code, Length: 37125, dtype: object
ete['onetsoc_code'].isna().sum()
np.int64(0)
I performed a quick visual inspection of the first 1000 rows and saw nothing else that needs wrangling.
I'm unsure how the .00 portion will join with the occupation codes in the BLS table. The first six digits make sense as a SOC code, just not the last two.
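For reference, the two digits after the dot are O*NET's own detail suffix: .00 marks the base SOC occupation, while .01 and up mark more detailed O*NET-SOC occupations. One plausible way to join against six-digit BLS SOC codes (a sketch, not necessarily the final approach) is to strip that suffix:

```python
import pandas as pd

codes = pd.Series(['11-1011.00', '11-1011.03', '53-7121.00'])
# Drop everything after the dot to recover the six-digit SOC code
soc = codes.str.split('.').str[0]
print(soc.tolist())  # ['11-1011', '11-1011', '53-7121']
```

Note that detailed occupations such as 11-1011.03 then collapse onto the same SOC code as their base occupation, which would require deduplication or aggregation before a wage join.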
element_id¶
This column is joinable on the content_model_reference table. The latter table provides the skill that is associated with the value found in the element_id column.
ete['element_id']
0 2.D.1
1 2.D.1
2 2.D.1
3 2.D.1
4 2.D.1
...
37120 3.A.3
37121 3.A.3
37122 3.A.3
37123 3.A.3
37124 3.A.3
Name: element_id, Length: 37125, dtype: object
ete['element_id'].describe()
count 37125 unique 6 top 2.D.1 freq 10536 Name: element_id, dtype: object
ete['element_id'].isna().sum()
np.int64(0)
scale_id¶
This joins on the scales_reference table, which provides the name and value range for each scale_id.
ete['scale_id'].describe()
count 37125 unique 5 top RL freq 10536 Name: scale_id, dtype: object
ete['scale_id'].value_counts()
scale_id RL 10536 RW 9658 PT 7902 OJ 7902 IM 1127 Name: count, dtype: int64
ete['scale_id'].isna().sum()
np.int64(0)
category¶
This joins on the ete_categories table.
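Note that the decode key is the (element_id, scale_id, category) triple, not category alone. A minimal sketch with hypothetical miniature frames:

```python
import pandas as pd

ete = pd.DataFrame({'element_id': ['2.D.1'], 'scale_id': ['RL'],
                    'category': [2.0], 'data_value': [70.1]})
cats = pd.DataFrame({'element_id': ['2.D.1'], 'scale_id': ['RL'], 'category': [2.0],
                     'category_description': ['High School Diploma']})
# Merge on all three key columns so the same category number under a
# different scale does not decode to the wrong description
decoded = ete.merge(cats, on=['element_id', 'scale_id', 'category'], how='left')
print(decoded['category_description'].iloc[0])  # High School Diploma
```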
ete['category']
0 1.0
1 2.0
2 3.0
3 4.0
4 5.0
...
37120 5.0
37121 6.0
37122 7.0
37123 8.0
37124 9.0
Name: category, Length: 37125, dtype: float64
ete['category'].describe()
count 35998.000000 mean 5.707317 std 3.086156 min 1.000000 25% 3.000000 50% 6.000000 75% 8.000000 max 12.000000 Name: category, dtype: float64
ete['category'].isna().sum()
np.int64(1127)
These null values are significant for our purposes.
Without a category, these rows may not have useful data for our study.
ete[ete['category'].isna()]
| | onetsoc_code | element_id | scale_id | category | data_value | n | standard_error | lower_ci_bound | upper_ci_bound | recommend_suppress | date_updated | domain_source |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12 | 11-1011.00 | 2.D.4.a | IM | NaN | 3.01 | 28.0 | 0.2773 | 2.4454 | 3.5834 | N | 2023-08-01 | Incumbent |
| 42 | 11-1011.00 | 3.A.4.a | IM | NaN | 2.20 | 27.0 | 0.3437 | 1.4926 | 2.9055 | N | 2023-08-01 | Incumbent |
| 55 | 11-1011.03 | 2.D.4.a | IM | NaN | 2.70 | 27.0 | NaN | NaN | NaN | None | 2021-08-01 | Occupational Expert |
| 85 | 11-1011.03 | 3.A.4.a | IM | NaN | 1.81 | 27.0 | NaN | NaN | NaN | None | 2021-08-01 | Occupational Expert |
| 98 | 11-1021.00 | 2.D.4.a | IM | NaN | 1.88 | 28.0 | 0.2508 | 1.3704 | 2.3996 | N | 2023-08-01 | Incumbent |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 36915 | 53-7065.00 | 3.A.4.a | IM | NaN | 1.26 | 61.0 | 0.1361 | 1.0000 | 1.5365 | N | 2022-08-01 | Incumbent |
| 36969 | 53-7072.00 | 2.D.4.a | IM | NaN | 1.81 | 19.0 | 0.4649 | 1.0000 | 2.7914 | N | 2021-08-01 | Incumbent |
| 36999 | 53-7072.00 | 3.A.4.a | IM | NaN | 1.97 | 19.0 | 0.2854 | 1.3720 | 2.5714 | N | 2021-08-01 | Incumbent |
| 37053 | 53-7081.00 | 2.D.4.a | IM | NaN | 3.07 | 20.0 | 0.2438 | 2.5557 | 3.5762 | N | 2023-08-01 | Incumbent |
| 37083 | 53-7081.00 | 3.A.4.a | IM | NaN | 2.06 | 20.0 | 0.1651 | 1.7117 | 2.4028 | N | 2023-08-01 | Incumbent |
1127 rows × 12 columns
We may drop these at a later time. Notably, every row shown above has scale_id IM, and the 1127 null categories equal the 1127 IM rows counted earlier, which suggests the IM scale reports a single mean rating with no category breakdown.
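The preview above shows scale_id IM on every null-category row, and the 1127 nulls match the 1127 IM rows in the value counts. A quick check of that pattern (sketched here with a hypothetical miniature frame) could look like:

```python
import pandas as pd
import numpy as np

# Hypothetical miniature frame mirroring the pattern seen above
ete = pd.DataFrame({'scale_id': ['RL', 'RL', 'IM', 'IM', 'PT'],
                    'category': [1.0, 2.0, np.nan, np.nan, 3.0]})
# Which scales account for the null categories?
null_scales = ete.loc[ete['category'].isna(), 'scale_id'].unique()
print(null_scales)  # ['IM']
```

If the real data confirms that only IM rows have null categories, dropping them loses nothing for a category-based analysis.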
data_value¶
From the website:
Rating associated with the O*NET-SOC occupation
An AI Overview provided by Google states:
The O*NET-SOC data_value rating indicates the importance or level of specific occupational descriptors (such as skills, abilities, or work context) for a particular job, generally ranging from "Not Important" (1) to "Extremely Important" (5). These ratings, often provided as a mean extent (EX) or content (CX) value, help quantify the requirements and characteristics of over 1,000 occupations.
Key details regarding O*NET-SOC ratings include:
- Scale IDs (Scale ID): Ratings are categorized by scale types. EX indicates a Mean Extent rating (degree of importance), while CX denotes a Content Context rating.
- Rating Significance (data_value): These values show how necessary a skill or activity is, typically on a scale from 1 (lowest) to 5 (highest).
- Data Source: These ratings are derived from surveys of job incumbents and occupational experts.
- Application: These metrics allow for detailed comparisons between occupations in the O*NET Database.
In this dataset, however, the values are shown (below) to range from 0 to 100, not a maximum of 5.
ete['data_value']
0 0.00
1 4.46
2 0.00
3 0.00
4 5.15
...
37120 27.84
37121 12.27
37122 1.21
37123 1.57
37124 0.00
Name: data_value, Length: 37125, dtype: float64
ete['data_value'].describe()
count 37125.000000 mean 9.539695 std 13.901753 min 0.000000 25% 0.000000 50% 3.570000 75% 14.290000 max 100.000000 Name: data_value, dtype: float64
ete['data_value'].isna().sum()
np.int64(0)
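One way to reconcile the 0-100 observation with the scale definitions is to compare each scale's observed range against the minimum/maximum columns of scales_reference. The bounds below are hypothetical placeholders; in practice they come from the sr table.

```python
import pandas as pd

# Hypothetical per-scale bounds; in practice these come from scales_reference
sr = pd.DataFrame({'scale_id': ['RL', 'IM'],
                   'minimum': [0.0, 1.0], 'maximum': [100.0, 5.0]})
ete = pd.DataFrame({'scale_id': ['RL', 'RL', 'IM'],
                    'data_value': [4.46, 88.5, 3.2]})

# Observed min/max per scale, placed side by side with the documented bounds
obs = ete.groupby('scale_id')['data_value'].agg(['min', 'max'])
check = obs.join(sr.set_index('scale_id'))
in_bounds = (check['min'] >= check['minimum']) & (check['max'] <= check['maximum'])
print(in_bounds.all())  # True
```

This would make it explicit whether the percentage-style scales (0-100) and the mean-rating scales (such as IM) are being interpreted correctly.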
n¶
Sample size for each row/occupation category.
ete['n']
0 28.0
1 28.0
2 28.0
3 28.0
4 28.0
...
37120 27.0
37121 27.0
37122 27.0
37123 27.0
37124 27.0
Name: n, Length: 37125, dtype: float64
ete['n'].isna().sum()
np.int64(0)
ete['n'].describe()
count 37125.000000 mean 25.171152 std 9.474034 min 9.000000 25% 20.000000 50% 23.000000 75% 28.000000 max 98.000000 Name: n, dtype: float64
standard_error¶
ete['standard_error'].describe()
count 28115.000000 mean 4.704188 std 5.432629 min 0.000000 25% 0.000000 50% 2.311300 75% 8.669050 max 29.734500 Name: standard_error, dtype: float64
lower_ci_bound¶
ete['lower_ci_bound'].describe()
count 17736.000000 mean 5.344971 std 8.259175 min 0.001200 25% 0.720125 50% 2.181450 75% 6.349525 max 88.527500 Name: lower_ci_bound, dtype: float64
upper_ci_bound¶
ete['upper_ci_bound'].describe()
count 17736.000000 mean 38.158627 std 22.817287 min 0.103800 25% 19.622000 50% 38.295200 75% 54.458500 max 99.642700 Name: upper_ci_bound, dtype: float64
recommend_suppress¶
Low precision indicator (Y=yes, N=no).
Rows flagged Y should probably be dropped from the dataset.
ete['recommend_suppress'].describe()
count 28115 unique 2 top N freq 27249 Name: recommend_suppress, dtype: object
ete['recommend_suppress'].value_counts()
recommend_suppress N 27249 Y 866 Name: count, dtype: int64
ete['recommend_suppress'].isna().sum()
np.int64(9010)
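A filtering sketch for later: keep rows not flagged Y. The 9010 NaN values here match the 9010 Occupational Expert rows in domain_source, and a simple .ne('Y') test would retain them. Hypothetical miniature frame:

```python
import pandas as pd

ete = pd.DataFrame({'recommend_suppress': ['N', 'Y', None, 'N'],
                    'data_value': [4.46, 1.2, 3.3, 5.15]})
# .ne('Y') is True for both 'N' and NaN, so expert rows are retained
kept = ete[ete['recommend_suppress'].ne('Y')]
print(len(kept))  # 3
```

Whether the NaN (Occupational Expert) rows should also be kept is a judgment call to revisit during the comparison work.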
date_updated¶
This data may be of little value for our purposes.
ete['date_updated'].describe()
count 37125 mean 2020-04-11 18:41:09.818181632 min 2004-12-01 00:00:00 25% 2017-07-01 00:00:00 50% 2020-08-01 00:00:00 75% 2023-08-01 00:00:00 max 2025-08-01 00:00:00 Name: date_updated, dtype: object
domain_source¶
This data may have little value for our purposes.
ete['domain_source'].describe()
count 37125 unique 2 top Incumbent freq 28115 Name: domain_source, dtype: object
ete['domain_source'].value_counts()
domain_source Incumbent 28115 Occupational Expert 9010 Name: count, dtype: int64
ete_categories¶
ete_categories.head()
| | element_id | scale_id | category | category_description |
|---|---|---|---|---|
| 0 | 2.D.1 | RL | 1.0 | Less than a High School Diploma |
| 1 | 2.D.1 | RL | 2.0 | High School Diploma - or the equivalent (for e... |
| 2 | 2.D.1 | RL | 3.0 | Post-Secondary Certificate - awarded for train... |
| 3 | 2.D.1 | RL | 4.0 | Some College Courses |
| 4 | 2.D.1 | RL | 5.0 | Associate's Degree (or other 2-year degree) |
ete_categories.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 41 entries, 0 to 40 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 element_id 41 non-null object 1 scale_id 41 non-null object 2 category 41 non-null float64 3 category_description 41 non-null object dtypes: float64(1), object(3) memory usage: 1.4+ KB
ete_categories.duplicated().sum()
np.int64(0)
Observations¶
This dataset is simply a decoder and needs little wrangling. There are no null values or duplicates.
occupation_data¶
od.head()
| | onetsoc_code | title | description |
|---|---|---|---|
| 0 | 11-1011.00 | Chief Executives | Determine and formulate policies and provide o... |
| 1 | 11-1011.03 | Chief Sustainability Officers | Communicate and coordinate with management, sh... |
| 2 | 11-1021.00 | General and Operations Managers | Plan, direct, or coordinate the operations of ... |
| 3 | 11-1031.00 | Legislators | Develop, introduce, or enact laws and statutes... |
| 4 | 11-2011.00 | Advertising and Promotions Managers | Plan, direct, or coordinate advertising polici... |
od.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1016 entries, 0 to 1015 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 onetsoc_code 1016 non-null object 1 title 1016 non-null object 2 description 1016 non-null object dtypes: object(3) memory usage: 23.9+ KB
Observations¶
Once again, this dataset is a decoder, providing meaning to onetsoc_code codes such as 11-1011.00, and does not need wrangling.
content_model_reference¶
cmr.head()
| | element_id | element_name | description |
|---|---|---|---|
| 0 | 1 | Worker Characteristics | Worker Characteristics |
| 1 | 1.A | Abilities | Enduring attributes of the individual that inf... |
| 2 | 1.A.1 | Cognitive Abilities | Abilities that influence the acquisition and a... |
| 3 | 1.A.1.a | Verbal Abilities | Abilities that influence the acquisition and a... |
| 4 | 1.A.1.a.1 | Oral Comprehension | The ability to listen to and understand inform... |
Observations¶
This is another decoder dataset, providing meaning to element_id codes such as 1.A.1.a.1. The cmr dataset does not need wrangling.
scales_reference¶
sr.head()
| | scale_id | scale_name | minimum | maximum |
|---|---|---|---|---|
| 0 | AO | Automation | 1.0 | 5.0 |
| 1 | CF | Frequency | 1.0 | 5.0 |
| 2 | CN | Amount of Contact | 1.0 | 5.0 |
| 3 | CT | Context | 1.0 | 3.0 |
| 4 | CTP | Context (Categories 1-3) | 0.0 | 100.0 |
The scales_reference dataset is yet another decoder, providing meaning to the scale_id codes in the ete dataset.
Final Observations¶
The O*NET datasets appear to be much cleaner at this stage than the BLS dataset.
We did identify 1127 null values in ete['category']. Because the category column is paramount for comparing Education, Training, and Experience against the financial rewards found in the BLS dataset, we likely should drop these rows.
However, we should wait to drop them until we have assembled and begun the data comparison. These rows may still carry information linked through other keys, such as scale_id. Once we are certain they are not needed, we will go ahead and drop them.