import pandas as pd #importing pandas package
import numpy as np #importing numpy package
# Load the dataset straight from its CSV download endpoint into a DataFrame.
DATA_URL = (
    "https://data.smartcolumbusos.com/api/v1/dataset/"
    "dd66bd77-7fa2-4b57-9a21-8d1606430d5c/download?_format=csv"
)
data = pd.read_csv(DATA_URL)

# --- Quick look at the frame as a whole ---
data.shape  # (number of rows, number of columns)
data
data.columns  # column labels
data.dtypes  # dtype pandas inferred for each column
data.describe()  # summary statistics

# --- First and last rows, via the helper methods and via explicit slices ---
data.head()  # first 5 rows
data.iloc[:5]  # same rows, selected by position
data.tail()  # last 5 rows
data.iloc[-5:]  # same rows, selected by position

# --- Label-based lookup ---
# .loc selects by index label; read_csv assigns a default 0-based index,
# so label 12 is the 13th row of the file.
data.loc[12]
data.loc[12, "deaths"]  # the 'deaths' value of that same row
# Which values does "sex" take, and how many rows carry each one?
data["sex"].unique()
data["sex"].value_counts()

# One-hot encode "sex": get_dummies builds one indicator column per distinct
# value, and the prefix is put in front of each new column name.
encode_sex = pd.get_dummies(data["sex"], prefix="encode")
# Attach the indicator columns next to the existing ones (column-wise concat,
# rows are aligned on the index).
data = pd.concat([data, encode_sex], axis="columns")
data.head()

# Add a "Missing" flag: "yes" where sex is recorded as "Unknown", "no" otherwise.
sex_is_unknown = data["sex"] == "Unknown"
data.loc[sex_is_unknown, "Missing"] = "yes"
data.loc[~sex_is_unknown, "Missing"] = "no"
data.head()
# Collapse the 11 age bands into a two-level "age" column.
# Bands under 15 become "children"; every band listed in otherAge becomes "old".
# Rows whose band is in neither list are left untouched.
childAge = ("< 1", "1-4", "5-14")
otherAge = ("15-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75-84", "85+")

age_band = data["age 11 categories"]
data.loc[age_band.isin(childAge), "age"] = "children"
data.loc[age_band.isin(otherAge), "age"] = "old"

# Show the new column beside the original bands to eyeball the mapping.
data[["age", "age 11 categories"]]

# Descriptive statistics for the death counts.
data["deaths"].describe()

# Row counts for every (age group, sex) combination.
pd.crosstab(index=data["age"], columns=data["sex"])
import seaborn as sns # importing package 'seaborn'
import matplotlib.pyplot as plt # importing package matplotlib.pyplot
%matplotlib inline
#to show graph in notebook
# Every chart below is an axes-level seaborn call, which draws onto the
# *current* matplotlib Axes. Opening a new figure before each one keeps the
# charts from being painted on top of each other when these lines run
# together (as a script or inside a single notebook cell).
age_order = ("children", "old")  # fixed left-to-right order of the age groups

# Distribution of population: 3-bin density histogram with a KDE curve.
# sns.distplot is deprecated; histplot with kde=True and stat="density"
# is its documented replacement (cut=3 matches distplot's KDE extent).
plt.figure()
sns.histplot(
    data=data,
    x="population",
    bins=3,
    kde=True,
    stat="density",
    kde_kws={"cut": 3},
)

# Number of rows in each age group.
plt.figure()
sns.countplot(data=data, x="age", order=age_order)

# Number of rows for each distinct value of deaths.
plt.figure()
sns.countplot(data=data, x="deaths")

# Same count, split by sex.
plt.figure()
sns.countplot(data=data, x="deaths", hue="sex")

# Point estimate of deaths (with error bars) for each age group.
plt.figure()
sns.pointplot(data=data, x="age", y="deaths", order=age_order)

# Same estimate, split by sex.
plt.figure()
sns.pointplot(data=data, x="age", y="deaths", hue="sex", order=age_order)
data.columns  # final column names, including the derived encode_*, Missing and age columns

# Export the enriched frame to CSV.
# index=False: the index is only the default 0..n-1 row counter from read_csv;
# writing it would add an unnamed first column that shows up as "Unnamed: 0"
# when the file is read back.
data.to_csv("output.csv", index=False)