# Code for data graphs

#!/home/zlatka/anaconda3/envs/bin/python

# -*- coding: utf-8 -*-
import warnings
# The filters are installed before the third-party imports below, presumably so
# that warnings emitted while importing them are silenced as well.
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)
import pandas as pd
import numpy as np
import seaborn
import sys
import matplotlib.pyplot as plt
#import plotly.plotly as py
#import plotly.tools as tls
# any additional libraries would be imported here
print(sys.version)

# Load the survey file, skipping (with a warning) rows that cannot be parsed.
# `error_bad_lines` was replaced by `on_bad_lines` in newer pandas releases, so
# try the current keyword first and fall back to the legacy one when this
# pandas version does not know it (unknown keywords raise TypeError).
try:
    data = pd.read_csv('ool_pds.csv', sep=',', on_bad_lines='warn')
except TypeError:
    data = pd.read_csv('ool_pds.csv', sep=',', error_bad_lines=False)

# Print floats in plain fixed-point notation instead of scientific notation.
pd.set_option('display.float_format', lambda x: '%f' % x)

# print “****”, (len(data))
# # number of observations (rows)
# print “—-“,(len(data.columns)) # number of variables (columns)

# print type(data)
# Coerce the survey items used below to a numeric dtype.
# pd.to_numeric(errors='coerce') replaces the removed Series.convert_objects:
# any value that cannot be parsed as a number becomes NaN.
NUMERIC_COLUMNS = ['W1_J1_D', 'W2_QF4A', 'W2_QF5A', 'W2_QH1', 'W2_QF9']
for column in NUMERIC_COLUMNS:
    data[column] = pd.to_numeric(data[column], errors='coerce')

# data[‘AGE’] = data[‘AGE’].convert_objects(convert_numeric=True)
# print(“————————————————————————————————————–“)
# print(” Using OutlookOnLife dataset I looked at 4 categories:”)
# print(“————————————————————————————————————–“)
# print()
#
# print()
# #counts and percentages (i.e. frequency distributions) for each variable
# print(” Category 1: Require that an equal number of the top leadership positions in government go to women”)
# print((len(data[‘W1_J1_D’])), “number of observations (number of people)”)
#
# print()
# print(“number of people voted as 0 – bad proposal and 10 – very good proposal “)
# print(“interpretation – the biggest number of people voted 5 wchi is on the middle. This does not agree on equality of ” \
# “women getting on this positions “)
# print(“original”)
# data[‘W1_J1_D’]=data[‘W1_J1_D’].replace(-1, ‘refused’)
#
# c1 = data[‘W1_J1_D’].value_counts(sort=False)
# #index.sorted before
# # print(“sorted”, sorted(c1))
# # print (“XXXXXXXX”)
# print(c1)
# print()
# Category 1 (W1_J1_D): share of respondents giving each answer.
# normalize=True yields proportions instead of raw counts; sort=False keeps
# pandas from ordering the rows by frequency.
p1 = data['W1_J1_D'].value_counts(sort=False, normalize=True)
print(p1)
print()
#
#
#
# # seaborn.distplot(data[‘W1_J1_D’].dropna(), kde=False)
# # plt.xlabel(‘Category 1 split into 5 groups’)
# # plt.title(‘tales’)
# seaborn.countplot(x=’W1_J1_D’, data=data)
# plt.xlabel(‘Number of people voted as 0 – bad proposal and 10 – very good proposal ‘)
# plt.title(‘Unmanaged data for Category 1 Require that an equal number of the top leadership positions in government go to women’)
# plt.show()
#
# # example recoding values
# # #recoding values for S3AQ3B1 into a new variable, USFREQ
# print(“data managed”)
#
# data[‘REV_W1_J1_D’] = data[‘REV_W1_J1_D’].convert_objects(convert_numeric=True)
# Reverse-code the 0-10 answers of W1_J1_D (0 <-> 10, 1 <-> 9, ..., 5 stays 5).
# Series.map with a dict turns every value that is not a key into NaN, so any
# code outside 0..10 ends up missing in REV_W1_J1_D.
recode1 = {score: 10 - score for score in range(11)}
data['REV_W1_J1_D'] = data['W1_J1_D'].map(recode1)
#
# # data[‘REV_W1_J1_D’]=data[‘REV_W1_J1_D’].replace(-1, ‘refused’)
# # data[‘REV_W1_J1_D’] = data[‘REV_W1_J1_D’].convert_objects(convert_numeric=True)
# # data[‘REV_W1_J1_D’]=data[‘REV_W1_J1_D’].fillna(‘refused’, inplace=False)
# c1 = data[‘REV_W1_J1_D’].value_counts(sort=False, dropna=True)
#
# # data[‘REV_W1_J1_D’].astype(int)
# # c1 = data[‘REV_W1_J1_D’].sort_index(axis=0)
#
# print(c1)
# #Univariate histogram for quantitative variable:
# seaborn.distplot(data[‘REV_W1_J1_D’].dropna());
# plt.xlabel(‘Number of people voted as 0 – bad proposal and 10 – very good proposal’)
# plt.title(‘Require that an equal number of the top leadership positions in government go to women’)
# plt.show()
#
#
# p1 = data[‘REV_W1_J1_D’].value_counts(sort=False, normalize=True)
# # p1 = p1.sort()
# print(p1)
# seaborn.distplot(p1.dropna());
# plt.xlabel(‘number of people voted as 0 – bad proposal and 10 – very good proposal’)
# plt.title(‘Require that an equal number of the top leadership positions in government go to women’)
# plt.show()
#
# # #recoding values for S3AQ3B1 into a new variable, USFREQMO
# recode2 = {0: 10, 1: 9, 2: 8, 3: 7, 4: 6, 5: 5, 6: 4, 7: 3, 8: 2, 9: 1, 10: 0}
# data[‘REV_W1_J1_D’]= data[‘W1_J1_D’].map(recode1)

# # recoding values for S3AQ3B1 into a new variable, USFREQ
# recode1 = {1: 6, 2: 5, 3: 4, 4: 3, 5: 2, 6: 1}
# sub2[‘USFREQ’]= sub2[‘S3AQ3B1’].map(recode1)
#
# #recoding values for S3AQ3B1 into a new variable, USFREQMO
# recode2 = {1: 30, 2: 22, 3: 14, 4: 5, 5: 2.5, 6: 1}
# sub2[‘USFREQMO’]= sub2[‘S3AQ3B1’].map(recode2)

# Quartile split: pd.qcut places the reverse-coded 0-10 score into 4 bins that
# hold (approximately) the same number of respondents.
# NOTE(review): on a coarse discrete scale qcut can hit duplicate bin edges;
# confirm this runs on the installed pandas version.
print('Category 1 – 4 categories – quartiles')

data['Cat1_4'] = pd.qcut(
    data.REV_W1_J1_D,
    4,
    labels=["1=0%tile", "2=25%tile", "3=50%tile", "4=75%tile"],
)

quartile_counts = data['Cat1_4'].value_counts(sort=False, dropna=True)
print(quartile_counts)
print()

# Five-group split: pd.cut uses 5 bins of equal WIDTH over the score range, so
# unlike the quartile split above the groups are not equally populated.
# NOTE(review): the "%tile" labels suggest quantiles; use pd.qcut(..., 5) if
# quintiles were intended.
print('Category 1 split into 5 groups')
data['Cat1_5'] = pd.cut(
    data.REV_W1_J1_D,
    5,
    labels=["1=0%tile", "2=20%tile", "3=40%tile", "4=60%tile", "5=80%tile"],
)
# `split` keeps the five-group counts; it is printed again further down.
split = data['Cat1_5'].value_counts(sort=False, dropna=True)
print(split)
#
# # Plot examples
#
# # bivariate bar graph C->C
# # seaborn.factorplot(x=’ETHRACE2A’, y=’DAILY’, data=sub2, kind=”bar”, ci=None)
# # plt.xlabel(‘Ethnic Group’)
# # plt.ylabel(‘Proportion Daily Smokers’)
#
# # # bivariate bar graph C->Q
# # sub2[‘PACKCATEGORY’] = sub2[‘PACKCATEGORY’].convert_objects(convert_numeric=True)
# # # sub2[‘TAB12MDX’] = sub2[‘TAB12MDX’].convert_objects(convert_numeric=True)
# # seaborn.factorplot(x=”PACKCATEGORY”, y=”TAB12MDX”, data=sub2, kind=”bar”, ci=None)
# # plt.xlabel(‘Packs per Month’)
# # plt.ylabel(‘Proportion Nicotine Dependent’)
#
# #basic scatterplot: Q->Q
# # scat1 = seaborn.regplot(x=”urbanrate”, y=”internetuserate”, fit_reg=False, data=data)
# # plt.xlabel(‘Urban Rate’)
# # plt.ylabel(‘Internet Use Rate’)
# # plt.title(‘Scatterplot for the Association Between Urban Rate and Internet Use Rate’)
#
#
#
# #Univariate histogram for quantitative variable:
# # seaborn.distplot(data[‘Cat1_5’].dropna(), kde=False);
# # plt.xlabel(‘Category 1 split into 5 groups’)
# # plt.title(‘tales’)
#
#
# #basic scatterplot: Q->Q
# # scat1 = seaborn.regplot(x=”urbanrate”, y=”internetuserate”, fit_reg=False, data=data)
# # plt.xlabel(‘Urban Rate’)
# # plt.ylabel(‘Internet Use Rate’)
#
# #univariate bar graph for categorical variables
# # First change the format from numeric to categorical
# data[‘Cat1_5’] = data[‘Cat1_5’].astype(‘category’)
# seaborn.countplot(x=’Cat1_5′, data=data)
# plt.xlabel(‘”1=0%tile”,”2=20%tile”,”3=40%tile”,”4=60%tile”,”5=80%tile”‘)
# plt.title(‘Category 1 split into 5 groups’)
# plt.show()
#
# print()
# print(“————————————————————————————————————–“)
# print(“Category 2: Churches or places of worship should allow more women to become members of the clergy.”)
# print((len(data[‘W1_M4’])), “number of observations (number of people)”)
# print(“number of people voted as 1 – strongly agree and 4 – strongly disagree “)
# print(“interpretation – the biggest number of people somewhat agree. This does not agree on equality of ” \
# “women getting on this positions ” \
# “Also relatively large number of people refused to answer this question”)
#
# print(“data managed”)
# Category 2 (W1_M4): relabel the -1 code as 'refused' so it reads clearly in
# the frequency tables. This makes the column hold a mix of numbers and text.
data['W1_M4'] = data['W1_M4'].replace(-1, 'refused')

# Raw counts per answer; `c2` is reused by the bar chart further down.
c2 = data['W1_M4'].value_counts(sort=False)
print(c2)

# Proportions per answer, most frequent answer first.
p2 = data['W1_M4'].value_counts(sort=True, normalize=True)
print(p2)
# seaborn.distplot(p2.dropna());
# plt.xlabel(‘dist – Number of people voted with 1 – strongly agree, 2 -somewhat agree, 3 – somewhat disagree and 4 – strongly disagree’)
# plt.title(‘More women should become members of the clergy’)
# plt.show()
#
#
# seaborn.countplot(x=’W1_M4′, data=data)
# plt.xlabel(‘Number of people voted with 1 – strongly agree, 2 -somewhat agree, 3 – somewhat disagree and 4 – strongly disagree’)
# plt.title(‘More women should become members of the clergy’)
# plt.show()
#
# print()
# print(“—————————————————————————————————————“)
# print(“Category 3: How concerned are you personally about women’s rights?”)
#
# print((len(data[‘W2_QF9’])), “number of observations (number of people)”)
# print(“number of people voted as 1 – strongly agree and 4 – strongly disagree “)
# print(“interpretation – the biggest number of people somewhat agree. This does not agree on equality of ” \
# “women getting on this positions ” \
# “Relatively large number of people refused to answer this question”)
#
# print(“original”)
# Category 3 (W2_QF9): proportions per answer before any relabelling.
p3 = data['W2_QF9'].value_counts(sort=False, normalize=True)
print(p3)
print()

print("data managed")
# Relabel the -1 code as 'refused', then report counts and proportions again.
data['W2_QF9'] = data['W2_QF9'].replace(-1, 'refused')

# Raw counts per answer; `c3` is reused by the bar chart further down.
c3 = data['W2_QF9'].value_counts(sort=False)
print(c3)
p3 = data['W2_QF9'].value_counts(sort=False, normalize=True)
print(p3)
# seaborn.distplot(p3.dropna());
# plt.xlabel(‘dist – strongly agree, 2 -somewhat agree, 3 – somewhat disagree and 4 – strongly disagree’)
# plt.title(‘Category 3: How concerned are you personally about womens rights’)
# plt.show()
# print()
# print(“————————————————————————————————————–“)
# print(“Category 4:Discrimination against women is no longer a problem in the U.S”)
#
#
# print(” Discrimination against women is no longer a problem in the U.S. Do you agrees” \
# “people responded with 1 – Agree strongly to 5 disagree strongly and everyCategory in between “)
# print(“interpretation – 580 people somewhat disagree, which means that this is still an issue. ” \
# “Large number of records (693) are missing from this variable”)
# # data[‘W2_QH1’] = (pd.to_numeric(data[‘W2_QH1’]))
# # data[‘W2_QH1’] = data.W2_QH1.astype(numpy.float64)
# print(“original”)
# print((len(data[‘W2_QH1’])), “number of observations (number of people)”)
# #if you want to include a count of missing add ,dropna=False after sort=False
# c4 = data[‘W2_QH1’].value_counts(sort=False, dropna=False)
# # c4 = c4.sort()
# print(c4)
# p4 = data[‘W2_QH1’].value_counts(sort=False, normalize=True)
# print (p4)
# print()
# print(“data managed”)
#
# # data management doesn't work if the variable is not numeric
# # examples
# # sort by values
# # print c4a.value_counts(sort=True)
# # sort by variable
# # c4 = c4.sort()
# # convert to numeric – new method
# # c4 = pd.to_numeric((pd.to_numeric(data[‘W2_QH1′], errors=’coerce’)))
#
# # examples
# # replace missing values to NaN (this is how python represents missing data)
# # data[‘W2_QH1’]=data[‘W2_QH1’].replace(6, numpy.nan)
# #replace NaN with 11
# # data [‘VAR’] = sub2[‘S2AQ8A’].fillna(11, inplace=True)
# #data [‘W2_QH1’] = data[‘W2_QH1’].fillna(‘missing’, inplace=False)
# #data[‘W2_QH1’]=data[‘W2_QH1’].replace(-1, ‘refused’)
# Category 4 (W2_QH1): counts and proportions per answer.
# value_counts drops NaN by default, so missing answers are not tabulated,
# while len() below counts every row of the column, missing or not.
c4 = data['W2_QH1'].value_counts(sort=False)
print((len(data['W2_QH1'])), "number of observations (number of people)")
print(c4)
p4 = data['W2_QH1'].value_counts(sort=False, normalize=True)
print(p4)
# seaborn.distplot(p4.dropna());
# plt.xlabel(‘dist – 1 – Agree strongly to 5 disagree strongly and everyCategory in between ‘)
# plt.title(‘Category 4:Discrimination against women is no longer a problem in the U.S’)
# plt.show()

# print(“—————————————————————————————————————“)
# print()

# data[‘W2_QF9’] = data[‘W2_QF9’].value_counts(sort=False, dropna=True)
# data[‘W2_QH1’] = data[‘W2_QH1’].value_counts(sort=False, dropna=True)
# print(data[‘W2_QF9’])
# print(data[‘W2_QH1’])
#
# data[‘W2_QF9’] = data[‘W2_QF9’].convert_objects(convert_numeric=True)
# data[‘W2_QF9’] = data[‘W2_QF9’].astype(‘category’)
# # second create a new variable (PACKCAT) that has the new variable value labels
# # data[‘W2_QF9’]=data[‘W2_QF9’].cat.rename_categories([“Strontly Agree”, “Somewhat Agree”, “Somewhat Disagree”, “Strongly Disagree” “X”, “XX”])
# data[‘W2_QH1’] = data[‘W2_QH1’].convert_objects(convert_numeric=True)
# data[‘W2_QH1’] = data[‘W2_QH1’].astype(‘category’)
# # second create a new variable (PACKCAT) that has the new variable value labels
#
# # data[‘W2_QH1’]=data[‘W2_QH1’].cat.rename_categories([“Strontly Agree”, “Somewhat Agree”, “Somewhat Disagree”, “Strongly Disagree”, “X”, “XX”])
# print([‘W2_QF9′])
#
# # bivariate bar graph C->C

# works but not good for this type of data

# seaborn.factorplot(x=’W2_QF9′, y=’W2_QH1’, data=data.dropna(), kind=”bar”, ci=None)
# plt.xlabel(‘How concerned are you personally about women”s rights 1’)
# plt.ylabel(‘Do you agree discrimination against women is no longer a problem in the U.S?’)
# plt.show()
#
# # bivariate bar graph C->Q
# data[‘W2_QF9’] = data[‘W2_QF9’].convert_objects(convert_numeric=True)
# data[‘W2_QH1’] = data[‘W2_QH1’].convert_objects(convert_numeric=True)
#
#
# seaborn.factorplot(x=”W2_QH1″, y=”W2_QF9″, data=data, kind=”bar”, ci=None)
# plt.xlabel(‘Do you agree discrimination against women is no longer a problem in the U.S?’)
# plt.ylabel(‘How concerned are you personally about women”s rights’)
# plt.show()

# data[‘W2_QF9’] = data[‘W2_QF9’].value_counts(sort=False, dropna=True)
# data[‘W2_QH1’] = data[‘W2_QH1’].value_counts(sort=False, dropna=True)

# data[‘W2_QF9’] = data[‘W2_QF9’].convert_objects(convert_numeric=True)
# data[‘W2_QH1’] = data[‘W2_QH1’].convert_objects(convert_numeric=True)
#
# data[‘W2_QF9’]=data[‘W2_QF9’].fillna(‘6’, inplace=False)
# data[‘W2_QH1’]=data[‘W2_QH1’].fillna(‘6’, inplace=False)
#
# data[‘W2_QF9’] = data[‘W2_QF9’].value_counts(sort=False, normalize=True)
# data[‘W2_QH1’] = data[‘W2_QH1’].value_counts(sort=False, normalize=True)
# print(“data managed”)
# data[‘W2_QF9’]=data[‘W2_QF9’].replace(-1, ‘refused’)
# #data[‘W2_QF9’]=data[‘W2_QF9’].fillna(‘missing’, inplace=False)
# data[‘W2_QF9’]= data[‘W2_QF9’].value_counts(sort=False)
# data[‘W2_QH1’]= data[‘W2_QH1’].value_counts(sort=False)
# print(c3)
# p3 = data[‘W2_QF9’].value_counts(sort=False, normalize=True)
# print(p3)
#seaborn.set_context(“paper”, font_scale=2)
#seaborn.set_style(“white”)
# plt.rc(‘text’, usetex=False)
# fig, ax = plt.subplots(figsize=(4,4))
# B=[‘Category1′,’Category2′]
# ax.hist([p3,p4], histtype=’bar’, align=’mid’, label=B, alpha=0.4)#, rwidth=0.6)
# ax.set_xlabel(‘My label’)
# ax.get_yaxis().set_visible(False)
# ax.legend()
# plt.show()

# plt.rc(‘text’, usetex=False)
# fig, ax = plt.subplots(figsize=(4,4))
# B=[‘Category1′,’Category2′,’Category3′,’Category4′]
# ax.hist([p1,p2,p3,p4], histtype=’bar’, align=’mid’, label=B, alpha=0.4)#, rwidth=0.6)
# ax.set_xlabel(‘Aggregated Chart’)
# ax.get_yaxis().set_visible(True)
# ax.legend()
# plt.show()
#plt.rc(‘text’, usetex=False)
#fig, ax = plt.subplots(figsize=(10,10))
#B=[‘Category1′,’Category2′,’Category3′,’Category4′]
#ax.hist([split,c2,c3,c4], histtype=’bar’, align=’mid’, label=B, orientation = ‘horizontal’, alpha=1)#, rwidth=0.6)
#ax.set_xlabel(‘Aggregated Chart’)
#ax.get_yaxis().set_visible(True)
#ax.legend()
#plt.show()

#this works
#b = pd.DataFrame(data={
# ‘c2’: np.random.normal(size=(100,)),
# ‘c3’: np.random.lognormal(size=(100,)),
# ‘c4′: np.random.exponential(size=(100,))})
#print(b)
#seaborn.pairplot(b)

#works somewhat
#seaborn.set(style=”ticks”)
#
#rs = np.random.RandomState(11)
#x = rs.gamma(2, size=1000)
#y = -.5 * x + rs.normal(size=1000)
#
#seaborn.jointplot(x, y, kind=”hex”, stat_func=[split,c2,c3,c4], color=”#4CB391″)
#seaborn.set(style=”whitegrid”)
#
## Initialize the matplotlib figure
#f, ax = plt.subplots(figsize=(6, 15))

# Load the example car crash dataset
#crashes = seaborn.load_dataset(“car_crashes”).sort_values(“total”, ascending=False)
#print(total)
#
## Plot the total crashes
#seaborn.set_color_codes(“pastel”)
#seaborn.barplot(x=”total”, y=”abbrev”, data=crashes, label=”Total”, color=”b”)
#
## Plot the crashes where alcohol was involved
#seaborn.set_color_codes(“muted”)
#seaborn.barplot(x=”alcohol”, y=”abbrev”, data=crashes, label=”Alcohol-involved”, color=”b”)
#
## Add a legend and informative axis label
#ax.legend(ncol=2, loc=”lower right”, frameon=True)
#ax.set(xlim=(0, 24), ylabel=””,
# xlabel=”Automobile collisions per billion miles”)
#seaborn.despine(left=True, bottom=True)
print("————————")

# Recap of the frequency tables that feed the chart below.
print(c2)
print(c3)
print(split)


def _aligned_counts(counts, order):
    """Return `counts` re-ordered to `order`, using 0 for absent categories.

    Whole-number float labels (1.0, 2.0, ...) are first turned into ints so
    that a column coerced to float lines up with the integer codes in `order`.
    """
    aligned = counts.copy()
    aligned.index = [
        int(label) if isinstance(label, float) and label.is_integer() else label
        for label in aligned.index
    ]
    return aligned.reindex(order, fill_value=0)


# Fixed left-to-right order of the response categories. value_counts(sort=False)
# gives no guaranteed order, so both series are aligned to this list; otherwise
# the bars may not sit above the tick label they belong to.
RESPONSE_ORDER = [1, 2, 3, 4, 'refused']
means_2 = _aligned_counts(c2, RESPONSE_ORDER)
means_3 = _aligned_counts(c3, RESPONSE_ORDER)

# create plot
fig, ax = plt.subplots()
index = np.arange(len(RESPONSE_ORDER))
bar_width = 0.1
opacity = 0.8

# Two bars per response category, drawn side by side.
ax.bar(index, means_2, bar_width,
       align='center',
       alpha=opacity,
       color='b',
       label='women to become members of the clergy')

ax.bar(index + bar_width, means_3, bar_width,
       align='center',
       alpha=opacity,
       color='g',
       label='conserns about womens rights')

# The x axis shows response categories and the y axis their counts.
ax.set_xlabel('Response')
ax.set_ylabel('Number of respondents')
ax.set_title('Bivariate graph')
# Put each tick midway between the two centre-aligned bars of its group.
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels([str(label) for label in RESPONSE_ORDER])
ax.legend()

plt.tight_layout()
plt.show()

#mpl_fig = plt.figure()
#ax = mpl_fig.add_subplot(111)
#
#N = 5
#menMeans = (20, 35, 30, 35, 27)
#womenMeans = (25, 32, 34, 20, 25)
#menStd = (2, 3, 4, 1, 2)
#womenStd = (3, 5, 2, 3, 3)
#ind = np.arange(N) # the x locations for the groups
#width = 0.35 # the width of the bars: can also be len(x) sequence
#
#p1 = ax.bar(ind, menMeans, width, color=(0.2588,0.4433,1.0))
#p2 = ax.bar(ind, womenMeans, width, color=(1.0,0.5,0.62),
# bottom=menMeans)
#ax.set_ylabel(‘Scores’)
#ax.set_xlabel(‘Groups’)
#ax.set_title(‘Scores by group and gender’)
#
#ax.set_xticks(ind + width/2.)
#ax.set_yticks(np.arange(0, 81, 10))
#ax.set_xticklabels((‘G1’, ‘G2’, ‘G3’, ‘G4’, ‘G5′))
#
#plotly_fig = tls.mpl_to_plotly( mpl_fig )
#
## For Legend
#plotly_fig[“layout”][“showlegend”] = True
#plotly_fig[“data”][0][“name”] = “Men”
#plotly_fig[“data”][1][“name”] = “Women”
#
#
#plot_url = py.plot(plotly_fig, filename=’stacked-bar-chart’)
#basic scatterplot: Q->Q
# scat1 = seaborn.regplot(x=”W2_QF9″, y=”internetuserate”, fit_reg=False, data=data)
# plt.xlabel(‘Urban Rate’)
# plt.ylabel(‘Internet Use Rate’)
# plt.title(‘Scatterplot for the Association Between Urban Rate and Internet Use Rate’)
#secondary variable multiplying the number of days smoked/month and the approx number of cig smoked/day
# sub2[‘NUMCIGMO_EST’]=sub2[‘USFREQMO’] * sub2[‘S3AQ3C1’]

# PPAGECAT

# recode missing values to python missing (NaN)
# data[‘W2_QH1’]=data[‘W2_QH1’].replace(9, numpy.nan)

# c2 = sub2[‘S3AQ3B1’].value_counts(sort=False, dropna=False)

#include a count of missing records
# sub2[‘S3AQ3C1’]=sub2[‘S3AQ3C1’].replace(99, numpy.nan)
#recode missing values to numeric value, in this example replace Nan with 11
# sub2[‘S2AQ8A’].fillna(11, inplace=True)
# #recoding values for S3AQ3B1 into a new variable, USFREQ
# recode1 = {1: 6, 2: 5, 3: 4, 4: 3, 5: 2, 6: 1}
# sub2[‘USFREQ’]= sub2[‘S3AQ3B1’].map(recode1)
# #subset data to young adults age 18 to 25 who have smoked in the past 12 months
# sub1=data[(data[‘AGE’]>=18) & (data[‘AGE’]<=25) & (data[‘W2_QF4A’]==1)]
#

# upper-case all DataFrame column names – place after the code for loading data above
# data.columns = map(str.upper, data.columns)
#
# # bug fix for display formats to avoid run time errors – put after code for loading data above
# pandas.set_option(‘display.float_format’, lambda x:’%f’%x)

# Data Analysis and Interpretation Specialization