Data Management Decisions – code – Data Analysis and Interpretation Specialization

Summary: I am working with the Outlook on Life dataset. I am looking at women's positions (leadership, welfare vision, discrimination), so for now I will include many relevant variables in my personal codebook.

Data Management:
I re-coded the variables in Category 1 to be more intuitive: instead of 0 – bad proposal and 10 – very good proposal, I switched them to 0 – good proposal and 10 – bad proposal.
I coded in “refused to answer” records in my original count, as I would like to see them
I split Category 1 into 5 groups (vs. 4 in the video example). This way my category is similar to my other categories. In this case I coded out the missing records.
In Category 2 I coded in records and labelled them “refused”.
In Category 3 I coded in records labelled “missing” and “refused”.
In Category 4 I coded in records labelled “missing” and “refused”.

Frequency distributions
Original counts are still in my output so they can be compared to data managed counts. Both are labelled “original” and “data managed” respectively.

Program code is below. Program output is in a separate blog post.


#!/opt/local/bin/python

# -*- coding: utf-8 -*-
import warnings
warnings.simplefilter(action = "ignore", category = FutureWarning)
warnings.simplefilter(action = "ignore", category = UserWarning)
import pandas as pd
import numpy
# any additional libraries would be imported here

# Load the survey extract from the working directory.
# error_bad_lines=False asks pandas to skip malformed rows instead of raising.
data = pd.read_csv('ool_pds.csv', sep=',', error_bad_lines=False)
# Render floats in fixed-point notation (six decimals) in printed output.
pd.set_option('display.float_format', '{:f}'.format)

# print "****", (len(data))
# # number of observations (rows)
# print "----",(len(data.columns)) # number of variables (columns)

# any additional libraries would be imported here

# Data Management:
# I recoded the variables in Category 1 to be more intuitive instead of 0 - bad proposal and 10 - very good proposal
# I switched them to be 0 - good proposal and 10 - bad proposal
# I labeled refused to answer records as I would like to see them, but dropped them when I split into 5 categories as
# they don't provide useful information.
# I split into 5 categories (vs 4 in the video example). This way my category is similar to my other categories.
# In Category 2 I labeled refused records
# In Category 3 I labeled missing and refused records
# In Category 4 I labeled missing and refused records
# Original counts are still in my output so they can be compared to data managed counts.


# print type(data)
#setting variables you will be working with to numeric
# Coerce the working survey variables to a numeric dtype. Entries that cannot
# be parsed as numbers become NaN, which is how pandas represents missing data.
# pd.to_numeric(errors='coerce') replaces the deprecated
# Series.convert_objects(convert_numeric=True).
for column in ('W1_J1_D', 'W2_QF4A', 'W2_QF5A', 'W2_QH1', 'W2_QF9'):
    data[column] = pd.to_numeric(data[column], errors='coerce')

# data['AGE'] = data['AGE'].convert_objects(convert_numeric=True)
# Report banner, followed by two blank lines.
print("--------------------------------------------------------------------------------------------------------------")
print(" Using OutlookOnLife dataset I looked at 4 categories:")
print("--------------------------------------------------------------------------------------------------------------")
print("")

print("")
# Category 1 (W1_J1_D): frequency distributions as counts, then as proportions.
print(" Category 1: Require that an equal number of the top leadership positions in government go to women")
print("%d number of observations (number of people)" % len(data['W1_J1_D']))

print("")
print("number of people voted as 0 - bad proposal and 10 - very good proposal ")
print("interpretation - the biggest number of people voted 5 wchi is on the middle. This does not agree on equality of "
      "women getting on this positions ")
print("original")
# Show the -1 code under a readable label in the original distribution.
data['W1_J1_D'] = data['W1_J1_D'].replace(-1, 'refused')
c1 = data['W1_J1_D'].value_counts(sort=False).sort_index()
print(c1)
print("")
p1 = data['W1_J1_D'].value_counts(sort=False, normalize=True).sort_index()
print(p1)
print("")

print("data managed")
# Reverse the 0-10 scale (0 <-> 10, 1 <-> 9, ...) into a new column.
# Values with no key in the mapping, including the 'refused' label, come out
# of map() as NaN and therefore drop out of the counts below.
recode1 = dict((score, 10 - score) for score in range(11))
data['REV_W1_J1_D'] = data['W1_J1_D'].map(recode1)
c1 = data['REV_W1_J1_D'].value_counts(sort=False).sort_index()
print(c1)
p1 = data['REV_W1_J1_D'].value_counts(sort=False, normalize=True).sort_index()
print(p1)

# #recoding values for S3AQ3B1 into a new variable, USFREQMO
# recode2 = {0: 10, 1: 9, 2: 8, 3: 7, 4: 6, 5: 5, 6: 4, 7: 3, 8: 2, 9: 1, 10: 0}
# data['REV_W1_J1_D']= data['W1_J1_D'].map(recode1)

#recoding values for S3AQ3B1 into a new variable, USFREQ
# recode1 = {1: 6, 2: 5, 3: 4, 4: 3, 5: 2, 6: 1}
# sub2['USFREQ']= sub2['S3AQ3B1'].map(recode1)
#
# #recoding values for S3AQ3B1 into a new variable, USFREQMO
# recode2 = {1: 30, 2: 22, 3: 14, 4: 5, 5: 2.5, 6: 1}
# sub2['USFREQMO']= sub2['S3AQ3B1'].map(recode2)

# quartile split 10 steps records into 4 categories
# print 'Category 1 - 4 categories - quartiles'
#
# data['Cat1_4']=pd.qcut(data.REV_W1_J1_D, 4, labels=["1=0%tile","2=25%tile","3=50%tile","4=75%tile"])
#
# split = data['Cat1_4'].value_counts(sort=False, dropna=True)
# split = split.sort_index()
# print split
print("")
print('Category 1 split into 5 groups')
# pd.cut builds 5 equal-width bins over the range of the reversed scores.
# NOTE(review): the labels read like percentiles, but equal-width bins are not
# quantile bins (pd.qcut would produce those) - confirm the labels are intended.
cat1_labels = ["1=0%tile", "2=20%tile", "3=40%tile", "4=60%tile", "5=80%tile"]
data['Cat1_5'] = pd.cut(data['REV_W1_J1_D'], 5, labels=cat1_labels)
# NOTE(review): Cat1_5 holds the labels above, so replacing the value 10 looks
# like a no-op - confirm before removing.
data['Cat1_5'] = data['Cat1_5'].replace(10, numpy.nan)
# dropna=True leaves rows without a group (NaN) out of the counts.
split = data['Cat1_5'].value_counts(sort=False, dropna=True).sort_index()
print(split)



print("")
print("--------------------------------------------------------------------------------------------------------------")
# Category 2 (W1_M4): only the data-managed distribution is reported.
print("Category 2: Churches or places of worship should allow more women to become members of the clergy.")
print("%d number of observations (number of people)" % len(data['W1_M4']))
print("number of people voted as 1 - strongly agree and 4 - strongly disagree ")
print("interpretation - the biggest number of people somewhat agree. This does not agree on equality of "
      "women getting on this positions "
      "Also relatively large number of people refused to answer this question")

print("data managed")
# Show the -1 code under a readable label.
data['W1_M4'] = data['W1_M4'].replace(-1, 'refused')
c2 = data['W1_M4'].value_counts(sort=False).sort_index()
print(c2)
# Proportions are listed by frequency (sort=True), whereas the counts above
# are ordered by answer code.
p2 = data['W1_M4'].value_counts(sort=True, normalize=True)
print(p2)
print("")
print("---------------------------------------------------------------------------------------------------------------")
# Category 3 (W2_QF9): original distribution, then the data-managed one.
print("Category 3: How concerned are you personally about women's rights?")

print("%d number of observations (number of people)" % len(data['W2_QF9']))
print("number of people voted as 1 - strongly agree and 4 - strongly disagree ")
print("interpretation - the biggest number of people somewhat agree. This does not agree on equality of "
      "women getting on this positions "
      "Relatively large number of people refused to answer this question")

print("original")
c3 = data['W2_QF9'].value_counts(sort=False).sort_index()
print(c3)
# The original proportions are printed without re-sorting by answer code.
p3 = data['W2_QF9'].value_counts(sort=False, normalize=True)
print(p3)
print("")
print("data managed")
# Label refused (-1) and missing (NaN) answers so each gets its own row.
data['W2_QF9'] = data['W2_QF9'].replace(-1, 'refused').fillna('missing')
c3 = data['W2_QF9'].value_counts(sort=False).sort_index()
print(c3)
p3 = data['W2_QF9'].value_counts(sort=False, normalize=True).sort_index()
print(p3)
print("")
print("--------------------------------------------------------------------------------------------------------------")
# Category 4 (W2_QH1): original distribution, then the data-managed one.
print("Category 4:Discrimination against women is no longer a problem in the U.S")


# The two fragments below used to be joined with no separator, so the output
# read "agreespeople"; the trailing space after "agrees" fixes that.
print(" Discrimination against women is no longer a problem in the U.S. Do you agrees "
      "people responded with 1 - Agree strongly to 5 disagree strongly and everything in between ")
print("interpretation - 580 people somewhat disagree, which means that this is still an issue. "
      "Large number of records (693) are missing from this variable")
print("original")
print("%d number of observations (number of people)" % len(data['W2_QH1']))
# dropna=False keeps a row for missing (NaN) answers in the counts.
c4 = data['W2_QH1'].value_counts(sort=False, dropna=False).sort_index()
print(c4)
# The original proportions use the default dropna and are not re-sorted.
p4 = data['W2_QH1'].value_counts(sort=False, normalize=True)
print(p4)
print("")
print("data managed")

# Label missing (NaN) and refused (-1) answers so each gets its own row.
data['W2_QH1'] = data['W2_QH1'].fillna('missing')
data['W2_QH1'] = data['W2_QH1'].replace(-1, 'refused')
print("%d number of observations (number of people)" % len(data['W2_QH1']))
c4 = data['W2_QH1'].value_counts(sort=False).sort_index()
print(c4)
p4 = data['W2_QH1'].value_counts(sort=False, normalize=True)
# Sort before printing; calling print(p4.sort_index()) works whether print
# is a statement or a function.
print(p4.sort_index())
print("---------------------------------------------------------------------------------------------------------------")
print("")

Data Analysis and Interpretation Specialization

Data Management Decisions – code

datapret

Leave a comment Cancel reply

Search

Text Widget