US Census in 2014

US Census Data Analysis (Census Data 2014)

In [9]:
# Total vs. citizen population per age, drawn as grouped bars.
# NOTE(review): this cell is labelled In [9] but sits before the import and
# data-load cells; it looks like a preview copy of the later In [9] cell and
# needs `df_vote` and `plt` to exist already -- confirm it is intentional.
population_columns = ['Total Population', 'Total Citizen Population']
ax = df_vote.plot.bar(x='Age', y=population_columns, figsize=(16, 5))
ax.set(ylabel='Population (K)')
plt.show()

Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

Data Load

About Dataset

  • Population figures are given in thousands (K)
In [2]:
# Relative path to the 2014 voting/registration-by-age CSV.
path_data = 'data/reported_voting_and_registration_both_by_age_2014.csv'

# header=1 takes the column names from row index 1 of the file; read_csv
# already defaults to a comma separator.
df_vote = pd.read_csv(path_data, header=1)
In [3]:
# Preview the first rows of the freshly loaded frame to inspect its layout.
df_vote.head()
Out[3]:
Unnamed: 0 Unnamed: 1 Total Citizen Population Reported registered Unnamed: 4 Reported not registered Unnamed: 6 No response to registration 1 Unnamed: 8 Reported voted Unnamed: 10 Reported did not vote Unnamed: 12 No response to voting 2 Unnamed: 14 Reported registered.1 Reported voted.1
0 Ages NaN NaN Number Percent Number Percent Number Percent Number Percent Number Percent Number Percent Percent Percent
1 ..18 years 3,920 3,719 951 25.6 1,643 44.2 1,124 30.2 465 12.5 2,199 59.1 1,054 28.4 24.3 11.9
2 ..19 years 3,712 3,489 1,204 34.5 1,428 40.9 856 24.6 526 15.1 2,228 63.9 734 21.0 32.4 14.2
3 ..20 years 4,262 3,998 1,719 43.0 1,308 32.7 971 24.3 692 17.3 2,422 60.6 884 22.1 40.3 16.2
4 ..21 years 4,220 3,913 1,699 43.4 1,250 31.9 965 24.7 605 15.5 2,424 61.9 884 22.6 40.3 14.3

Data Cleansing

In [4]:
# List the raw column labels; the 'Unnamed: N' ones are renamed or dropped in the next cell.
df_vote.columns
Out[4]:
Index(['Unnamed: 0', 'Unnamed: 1', 'Total Citizen Population',
       'Reported registered', 'Unnamed: 4', 'Reported not registered',
       'Unnamed: 6', 'No response to registration 1', 'Unnamed: 8',
       'Reported voted', 'Unnamed: 10', 'Reported did not vote', 'Unnamed: 12',
       'No response to voting 2', 'Unnamed: 14', 'Reported registered.1',
       'Reported voted.1'],
      dtype='object')
In [5]:
# Give the two leading unnamed columns their real names and strip the
# footnote digits ("1", "2") from the two "No response" labels.
column_renames = {
    'Unnamed: 0': 'Age',
    'Unnamed: 1': 'Total Population',
    'No response to registration 1': 'No response to registration',
    'No response to voting 2': 'No response to voting',
}
df_vote = df_vote.rename(columns=column_renames)

# Every column still labelled 'Unnamed: N' holds a Percent value; keep the rest.
kept_columns = df_vote.columns[~df_vote.columns.str.startswith('Unnamed')]
df_vote = df_vote[kept_columns]
df_vote.head()
Out[5]:
Age Total Population Total Citizen Population Reported registered Reported not registered No response to registration Reported voted Reported did not vote No response to voting Reported registered.1 Reported voted.1
0 Ages NaN NaN Number Number Number Number Number Number Percent Percent
1 ..18 years 3,920 3,719 951 1,643 1,124 465 2,199 1,054 24.3 11.9
2 ..19 years 3,712 3,489 1,204 1,428 856 526 2,228 734 32.4 14.2
3 ..20 years 4,262 3,998 1,719 1,308 971 692 2,422 884 40.3 16.2
4 ..21 years 4,220 3,913 1,699 1,250 965 605 2,424 884 40.3 14.3
In [6]:
# Remove the sub-header row (index label 0: 'Ages', 'Number', 'Percent', ...)
# and the two trailing percentage columns.
#
# Both are addressed by label with errors='ignore', so re-running this cell is
# a harmless no-op. The previous positional `iloc[:, -2:]` drop could remove
# real count columns if the frame had already been trimmed, and `drop(0)`
# raised KeyError on a second run.
percent_columns = ['Reported registered.1', 'Reported voted.1']
df_vote = df_vote.drop(index=0, columns=percent_columns, errors='ignore')

df_vote.head()
Out[6]:
Age Total Population Total Citizen Population Reported registered Reported not registered No response to registration Reported voted Reported did not vote No response to voting
1 ..18 years 3,920 3,719 951 1,643 1,124 465 2,199 1,054
2 ..19 years 3,712 3,489 1,204 1,428 856 526 2,228 734
3 ..20 years 4,262 3,998 1,719 1,308 971 692 2,422 884
4 ..21 years 4,220 3,913 1,699 1,250 965 605 2,424 884
5 ..22 years 4,442 4,181 2,015 1,293 872 790 2,549 841
In [7]:
# Keep only the digits of each age label, e.g. '..18 years' -> '18'.
# - raw string: '\d' in a plain string literal is an invalid escape sequence
# - expand=False returns a Series (one capture group), not a 1-column frame
# - astype(str) lets the cell be re-run once 'Age' already holds integers
age_digits = df_vote['Age'].astype(str).str.extract(r'(\d+)', expand=False)

# Strip thousands separators ('3,920' -> '3920') and convert every column to
# integers. The frame is rebound rather than assigned through
# `.loc[:, col] = ...`: on recent pandas versions that form writes into the
# existing object-dtype column, so the values would stay `object` and the
# numeric plots further down would have nothing numeric to draw.
df_vote = (
    df_vote
    .assign(Age=age_digits)
    .replace(',', '', regex=True)
    .astype(int)
)

df_vote.head()
Out[7]:
Age Total Population Total Citizen Population Reported registered Reported not registered No response to registration Reported voted Reported did not vote No response to voting
1 18 3920 3719 951 1643 1124 465 2199 1054
2 19 3712 3489 1204 1428 856 526 2228 734
3 20 4262 3998 1719 1308 971 692 2422 884
4 21 4220 3913 1699 1250 965 605 2424 884
5 22 4442 4181 2015 1293 872 790 2549 841

Visualization for Data Understanding

In [8]:
# Show the column labels of the cleaned frame that the plots below refer to.
df_vote.columns
Out[8]:
Index(['Age', 'Total Population', 'Total Citizen Population',
       'Reported registered', 'Reported not registered',
       'No response to registration', 'Reported voted',
       'Reported did not vote', 'No response to voting'],
      dtype='object')
In [9]:
# Total vs. citizen population per age, drawn as grouped bars.
population_columns = ['Total Population', 'Total Citizen Population']
ax = df_vote.plot.bar(x='Age', y=population_columns, figsize=(16, 5))
ax.set(ylabel='Population (K)')
plt.show()
In [10]:
# Registration status counts per age: registered, not registered, no response.
registration_columns = [
    'Reported registered',
    'Reported not registered',
    'No response to registration',
]
ax = df_vote.plot.bar(x='Age', y=registration_columns, figsize=(16, 5))
ax.set(ylabel='Population (K)')
plt.show()
In [11]:
# Voting status counts per age: voted, did not vote, no response.
voting_columns = [
    'Reported voted',
    'Reported did not vote',
    'No response to voting',
]
ax = df_vote.plot.bar(x='Age', y=voting_columns, figsize=(16, 5))
ax.set(ylabel='Population (K)')
plt.show()

Ratio of citizenship / registration / voting

In [12]:
# Citizen population divided by total population, plotted against age.
citizen_ratio = df_vote['Total Citizen Population'] / df_vote['Total Population']

plt.figure(figsize=(16, 5))
plt.plot(df_vote['Age'], citizen_ratio, label='Citizen Ratio')
plt.legend()
plt.ylabel('Ratio')
plt.show()
In [13]:
# Registration outcomes by age, each divided by 'Total Population'.
# Pairs of (source column, legend label), drawn in this order.
registration_series = [
    ('Reported registered', 'Registered'),
    ('Reported not registered', 'Not registered'),
    ('No response to registration', 'No response to registration'),
]

plt.figure(figsize=(16, 5))
for column, label in registration_series:
    plt.plot(df_vote['Age'], df_vote[column] / df_vote['Total Population'], label=label)
legend = plt.legend()
plt.ylabel('Ratio')
plt.show()
In [14]:
# Voting outcomes by age, each divided by 'Total Population'
# (the total population column, not the citizen population).
plt.figure(figsize=(16, 5))
plt.plot(df_vote['Age'], df_vote['Reported voted'] / df_vote['Total Population'], label='Voted')
plt.plot(df_vote['Age'], df_vote['Reported did not vote'] / df_vote['Total Population'], label='Not voted')
# Legend label fixed from 'No respond to voting' so it matches the column name.
plt.plot(df_vote['Age'], df_vote['No response to voting'] / df_vote['Total Population'], label='No response to voting')
legend = plt.legend()
plt.ylabel('Ratio')
plt.show()
In [ ]:
 

Comments

Comments powered by Disqus