What is R?
R is an object-oriented programming language designed for data analysis. Working with R is easier with some prior programming knowledge; however, a person who is new to programming can still get along with R.
Download R and R Studio
Download R: Windows | Mac | Linux
Download R Studio
A few concepts of object-oriented programming languages:
1. Class: Class is a blueprint of an object. The class describes the content of the object that belongs to it. The class defines the overall data structure.
2. Object: Object can be anything from numbers to characters to DateTime. The nature of an object is defined by its class. The objects have the behavior of their class.
3. Polymorphism: Polymorphism is the ability of an object to take on many forms. The most common use of polymorphism in OOP occurs when a parent class reference is used to refer to a child class object (Source: Tutorialspoint.com).
4. Inheritance: Inheritance is a process of defining a new class based on an existing class by extending its common data members and methods.
R programming is not as complex as Java programming or C++. We use packages in R to make our work simple.
Data types in R
1. Scalar: Scalar is a number or a character.
2. Vector: Vector is a combination of numbers or characters.
3. Dataframe: Dataframe is the combination of vectors.
4. Matrix: A matrix is a rectangular array of numbers (or other mathematical objects) for which operations such as addition and multiplication are defined.
5. List: A list is an object that can contain elements of different types – such as strings, numbers, vectors, and even other lists.
Defining Object in R
Everything in R can be an object. We give an object a name and assign a value to it. The value of an object can be a number, a vector, a matrix, a dataframe, a list, and so on.
object_name <- 4  # A scalar: one single value stored under a name
assign("object_name", 4)  # Function form of the same assignment
Here, 4 is assigned to an object named 'object_name'.
object_name1 = c(4,5,6) # Example of a vector: c() combines the values 4, 5 and 6
Here, c stands for combine: the values 4, 5 and 6 are combined into one vector, which is assigned to the object named 'object_name1'.
<- and = work the same way for assignment, but <- (keyboard shortcut: Alt + -) is the one used more frequently.
Setting working directory in R
# Print the directory R currently works in
getwd()
# General form: setwd(dir = "directory_path")
# Example (write the path with / instead of \):
setwd(dir = "D:/~~~SPSS session/Materials")
Survey Data Analysis in R begins...
# Where is R working right now?
getwd()
# Make the materials folder the working directory
setwd(dir = "D:/~~~SPSS session/Materials")
# Load the Excel reader ----
# readxl has to be installed once; if it is not installed yet, remove the
# leading '#' from the next line and run it.
#install.packages("readxl")
library(readxl)

# Import the Excel dataset ----
# General form:
# read_excel(path = "name of datafile.xlsx", sheet = "Name of sheet")
# sudal_dataset is the object that holds the imported data.
sudal_dataset <- read_excel(
  path = "2077.06.18 SPSS dataset sudal.xlsx",
  sheet = "Data"
)

# Open the imported dataset in the data viewer
View(sudal_dataset)
# Variable (column) names
names(sudal_dataset)
# Number of variables (columns)
ncol(sudal_dataset)
# Number of observations, counted as the length of the SN column
length(sudal_dataset[["SN"]])
# Dimensions: number of observations (rows), then number of variables (columns)
dim(sudal_dataset)
# Row labels
rownames(sudal_dataset)
# Column labels (the variable names again)
colnames(sudal_dataset)
# First 6 observations (the default of head())
head(sudal_dataset)
# First 20 observations
head(sudal_dataset, n = 20L)
# Data structure: type and a preview of every variable
str(sudal_dataset)
# Look at the Household variable on its own in the data viewer
View(sudal_dataset$Household)
# Dealing with missing values ----
# Cell-by-cell map of the dataset: TRUE marks a missing value
is.na(sudal_dataset)
# Open the dataset in R's interactive editor to correct values by hand
fix(sudal_dataset)
# Total number of missing cells in the whole dataset
sum(is.na(sudal_dataset))

# Recode Gender so that "Female = 0" instead of "Female = 2" ----
# General form:
# dataset$variable_name[dataset$variable_name %in% old_value] <- new_value
# %in% never returns NA, so observations with a missing Gender are simply
# left untouched by the recode.
sudal_dataset$Gender[sudal_dataset$Gender %in% 2] <- 0

# There are missing values: build a copy without incomplete observations ----
# na.omit() returns a new object and does not change sudal_dataset, so the
# result is stored under its own name; called on its own it would only be
# printed and then lost.
sudal_complete <- na.omit(sudal_dataset)
sudal_complete
# Labelling variable values ----
# General form:
# dataset$variable_name <- factor(
#   dataset$variable_name,
#   levels = c(level1, level2, ...),
#   labels = c("name1", "name2", ...)
# )

# Gender: 0 = Female, 1 = Male
sudal_dataset$Gender <- factor(
  sudal_dataset$Gender,
  levels = 0:1,
  labels = c("Female", "Male")
)
summary(sudal_dataset$Gender)

# Area: 1 = Sudal, 2 = Koteshwor
sudal_dataset$Area <- factor(
  sudal_dataset$Area,
  levels = 1:2,
  labels = c("Sudal", "Koteshwor")
)
summary(sudal_dataset$Area)

# Education: codes 1 to 6
sudal_dataset$Education <- factor(
  sudal_dataset$Education,
  levels = 1:6,
  labels = c(
    "Never attended school", "Attended school", "SLC",
    "Intermediate", "Bachelors", "Masters"
  )
)
summary(sudal_dataset$Education)

# Employment: 1 = Employed, 2 = Unemployed
sudal_dataset$Employment <- factor(
  sudal_dataset$Employment,
  levels = 1:2,
  labels = c("Employed", "Unemployed")
)
summary(sudal_dataset$Employment)

# Family type: 1 = Nuclear, 2 = Joint
# The column name contains a space, so it is addressed as a quoted name.
sudal_dataset[["Family type"]] <- factor(
  sudal_dataset[["Family type"]],
  levels = 1:2,
  labels = c("Nuclear", "Joint")
)
summary(sudal_dataset[["Family type"]])

# Ethnicity: codes 1 to 10
sudal_dataset$Ethnicity <- factor(
  sudal_dataset$Ethnicity,
  levels = 1:10,
  labels = c(
    "Kami", "Brahmin", "Newar", "Chettri", "Sunuwar",
    "Magar", "Thing", "Tharu", "Madhesi", "Tamang"
  )
)
summary(sudal_dataset$Ethnicity)
# Recode Sector: value 4 becomes 3 ----
# General form:
# dataset$variable_name[dataset$variable_name == old_value] <- new_value
sudal_dataset$Sector[sudal_dataset$Sector == 4] <- 3

# Label Sector
# NOTE(review): after the recode above no observation carries code 4 any
# more, so the "Foreign Employment" level stays empty - confirm this is
# intended.
sudal_dataset$Sector <- factor(
  sudal_dataset$Sector,
  levels = 1:4,
  labels = c("Agriculture", "Business", "Service", "Foreign Employment")
)

# Label Occupation: codes 1 to 15
# NOTE(review): the last label is spelled "Government Servent"; it is kept
# exactly as written - confirm whether "Servant" was meant.
sudal_dataset$Occupation <- factor(
  sudal_dataset$Occupation,
  levels = 1:15,
  labels = c(
    "Tailor", "Farmer", "Blacksmith", "Police", "Guard",
    "Self-owned business", "Beautician", "Teacher", "Driver", "Cook",
    "Training Center", "Staff", "Foreign Employment", "Labour",
    "Government Servent"
  )
)
# Creating tables ----
# Cross-tabulate Gender (rows) against Ethnicity (columns)
mytable <- table(sudal_dataset$Gender, sudal_dataset$Ethnicity)
mytable # print table
margin.table(mytable, 1) # Gender frequencies, summed over Ethnicity
margin.table(mytable, 2) # Ethnicity frequencies, summed over Gender
prop.table(mytable) # each cell as a proportion of the grand total
prop.table(mytable, 1) # each cell as a proportion of its row total
prop.table(mytable, 2) # each cell as a proportion of its column total

# Save table ----
# With row.names = TRUE the file gets an extra first column of row names.
# col.names = NA adds an empty header cell above that column, so the header
# lines up with the data when the CSV is opened in a spreadsheet.
write.table(
  mytable,
  file = "table_1.csv",
  sep = ",",
  quote = FALSE,
  row.names = TRUE,
  col.names = NA
)
# Using the variables by their short names, without attach() ----
# attach() puts a copy of the columns on the search path; that copy can mask
# (or be masked by) other objects and does not follow later changes to the
# dataset. with() and the data = argument give the same short names safely.
mytable1 <- with(sudal_dataset, table(Gender, Ethnicity))
mytable1
# col.names = NA adds the empty header cell above the row-name column, so the
# header lines up with the data in the CSV file.
write.table(
  mytable1,
  file = "table_2.csv",
  sep = ",",
  quote = FALSE,
  row.names = TRUE,
  col.names = NA
)

# Creating charts ----
# Box plot
# General form:
# boxplot(numerical_variable ~ categorical_variable, data = dataset,
#         main = "Chart Title")
boxplot(sudal_dataset$Food_today, main = "Box plot")
# The grouping variable is Employment, so the title names Employment.
boxplot(
  Food_today ~ Employment,
  data = sudal_dataset,
  main = "Box plot of Food expenses by Employment"
)

# Scatter plot
# General form: plot(variable_name1 ~ variable_name2, data = dataset)
plot(Food_today ~ `Total Income`, data = sudal_dataset)
# Using ggplot2 for attractive graphs ----
# Install each package only when it is missing, so that re-running the
# script does not download and reinstall it every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
if (!requireNamespace("ggthemes", quietly = TRUE)) {
  install.packages("ggthemes")
}
library(ggplot2)
library(ggthemes)
# Box plot ----
# Food_today by Gender, with a separate box for each Employment group;
# outliers are drawn as large red stars and error bars are layered on top.
boxchart <- ggplot(
  data = sudal_dataset,
  mapping = aes(x = Gender, y = Food_today, fill = Employment)
) +
  geom_boxplot(
    outlier.colour = "red",
    outlier.shape = 8,
    outlier.size = 4
  ) +
  stat_boxplot(geom = "errorbar")

# Add the chart title and the axis labels
boxchart_final <- boxchart +
  ggtitle("Box plot of Food expenditure by Gender clustered by Employment") +
  xlab("Gender") +
  ylab("Food expenses")
boxchart_final

# Flip the box plot so that the boxes run horizontally
boxchart_final + coord_flip()
# Bar chart ----
# stat = "identity" uses the Food_today values themselves as bar heights
# (instead of counting rows); the bars are filled by Area.
barchart <- ggplot(
  data = sudal_dataset,
  mapping = aes(x = Gender, y = Food_today, fill = Area)
) +
  geom_bar(stat = "identity")
barchart
# Pie chart ----
# In ggplot2 a pie is one stacked bar bent into a circle:
#   x = ""          puts every observation into the same single bar,
#   y = Food_today  is the quantity that sets the size of each slice,
#   fill = Gender   splits the bar (and so the pie) into one slice per group,
#   coord_polar(theta = "y") maps the bar height to the angle.
pie <- ggplot(sudal_dataset, aes(x = "", y = Food_today, fill = Gender)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y", start = 0)
pie
# Scatter plot ----
# Total Income against Food_today on log10 axes, coloured by Gender, with a
# straight-line fit per Gender and one panel per Employment group.
# NOTE(review): both variables are divided by 10^6 while the axis labels
# say "(Rs.)" - confirm the intended unit of the labels.
scatterplot <- ggplot(
  sudal_dataset,
  aes(
    x = Food_today / 10^6,
    y = `Total Income` / 10^6,
    color = Gender,
    linetype = Gender
  )
) +
  geom_point(size = 2.5) +
  # Linear model fit without the confidence band
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~Employment, nrow = 2) +
  # scale_*_log10() gives the log10 axis without the `trans` argument,
  # which newer ggplot2 versions deprecate in favour of `transform`.
  scale_x_log10() +
  scale_y_log10() +
  ggtitle("Food expenditure and Total income by Gender grouped by Employment") +
  xlab("Food expenditure (Rs.) (log(10))") +
  ylab("Total income (Rs.) (log(10))") +
  theme_economist()
scatterplot
# Save scatterplot in working directory ----
# Open a JPEG file as the graphics device (size in pixels)
jpeg(filename = "scatterplot.jpg", width = 650, height = 450)
# A ggplot object is only drawn when it is printed. Typing its name prints
# it at the console, but not when the script is run with source(), so
# print() is called explicitly to make sure the file is not left empty.
print(scatterplot)
# Close the device so that the file is written to disk
dev.off()
To be continued...
Post a Comment