[UCLA : Statistical Consulting Group] Introduction to R

https://stats.oarc.ucla.edu/r/seminars/intro/
Introduction to R Seminar
stats.oarc.ucla.edu
R을 꼼꼼히 배우려다 보니 여기 저기 제대로 쑤시고 있는데, 책뿐만 아니라 여기서 제공하는 사이트도 정말 좋았다. 예제 문제와 함께 연습할 수 있도록 여러 설명을 제공해 준다. 함수부터 차근차근 이어 나가면 데이터 사이언스 책이랑 같이 공부하면 되게 좋은 듯 하다 ㅎㅎ 원래 블로그에 코딩한거 올리려 했는데, 저작권 문제가 있다고 해서 그냥 배운 내용 인증 정도로 남겨보려고 한다!
Today I learned
# 2022-04-06, 9:48 PM: coding start!

### Introduction to R

## Installing packages
# install.packages(pkg, dependencies = TRUE) also installs every other
# package that the target package requires.
# Only packages that are not yet available are installed, so re-running this
# script does not download everything again.
#   dplyr     - data manipulation
#   ggplot2   - drawing graphs
#   rmarkdown - building reports from R
#   shiny     - browser-based data apps written in R
needed_pkgs <- c("dplyr", "ggplot2", "rmarkdown", "shiny")
is_installed <- vapply(needed_pkgs, requireNamespace, logical(1),
                       quietly = TRUE)
missing_pkgs <- needed_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, dependencies = TRUE)
}

## Loading packages
# library() stops with an error when the package is missing, whereas
# require() only returns FALSE, so library() is used here.

library(ggplot2)
library(dplyr)
library(shiny)

## Vignettes - longer, tutorial-style guides for a package
# list all available vignettes
vignette()
# open the vignette named "dplyr"
vignette("dplyr")




### Basic R coding
# Assignment: store a value under a name.
a <- "hello"
# A line that ends on an operator is incomplete, so R keeps reading the
# next line before it evaluates the expression.
2 +
  3

# functions and help files
help("log")

# function arguments
log(x = 100, base = 10) # arguments specified by name
log(8, 2)               # arguments specified by position

# Vectors
# one dimensional and homogeneous (all elements share one type)

# Creating vectors
first_vec <- c(1, 3, 5)
first_vec

char_vec <- c("these", "are", "some", "words")
length(char_vec)

first_vec < c(2, 2, 2) # the result of this comparison is a logical vector

# first argument to rep is what to repeat and the second argument is the
# number of repetitions
rep(0, times = 3)
rep("abc", 4)

# arguments for seq are from, to, by
seq(from = 1, to = 5, by = 2)
seq(10, 0, -5)

# colon operator
3:7

# you can nest functions
rep(seq(1, 3, 1), times = 2)

# example: Create the vector (4,5,6) in three different ways using c(),
# seq(), and the : operator. Try creating the vector (2,2,1,1) in at least
# two different ways.

v1 <- c(4, 5, 6)
v2 <- 4:6
v3 <- seq(4, 6, 1)

# (2,2,1,1): spelled out with c(), and built by repeating each element of
# c(2, 1) twice
w1 <- c(2, 2, 1, 1)
w2 <- rep(c(2, 1), each = 2)


# Subsetting vectors with []
(a <- seq(10, 1, -1)) # putting () around a command will cause the result to be printed
a[2]
a[seq(1, 5)]
a[c(1, 3, 4)]

# example : Create the vector y as the integers counting down from 10 to 1.
# Extract the second, fifth, and seventh element of this vector y.

y <- 10:1
y[c(2, 5, 7)]

# conditional selection - subsetting by value
scores <- c(55, 24, 43, 10)
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, the full names cannot.
scores[c(FALSE, TRUE, TRUE, FALSE)]
scores < 30          # the comparison yields a logical vector ...
scores[scores < 30]  # ... which can be placed inside [] to subset

# example : Use conditional selection to find the numbers in y (integers
# from 10 to 1) that when multiplied by 2, the result is greater than 15.

y[y * 2 > 15]




### Importing and Exporting Data

## Dataset files
## Reading in text data
# read.csv() reads comma-separated files and read.delim() reads
# tab-delimited files; pass header = FALSE when the file has no header row.

# example : Create a dataset called dat_csv by loading a dataset from our
# server at this address: https://stats.idre.ucla.edu/stat/data/hsbraw.csv .
dat_csv <- read.csv("https://stats.idre.ucla.edu/stat/data/hsbraw.csv")

## Exporting Data
# write.csv() / write.table() write a data frame as text.
# save() stores one or more R objects, under their own names, in a binary
# file that load() restores.

# The files go to the session's temporary directory so the demo neither
# relies on placeholder paths nor leaves files in the working directory.
csv_path <- file.path(tempdir(), "hsbraw_copy.csv")
tab_path <- file.path(tempdir(), "hsbraw_copy.txt")
rdata_path <- file.path(tempdir(), "hsbraw_copy.RData")

write.csv(dat_csv, file = csv_path, row.names = FALSE) # save as csv
write.table(dat_csv, file = tab_path, sep = "\t", row.names = FALSE)

# read the exported files back in
data <- read.csv(csv_path)
dat.tab <- read.delim(tab_path, sep = "\t")

# store both objects in one R data file, then restore them
save(data, dat.tab, file = rdata_path)
load(rdata_path)

# packages for importing
# readxl - Excel files
# haven - Stata, SAS, SPSS




### Data Frames
## Data frames
# read.csv() and read.table() return data frames.
# A data frame is rectangular: columns are variables, rows are observations
# of those variables.
# Columns may hold different data types but must all have equal length.
# In short: two dimensional, heterogeneous, rectangular.

## viewing data as a spreadsheet with View(); first/last rows with head()
## and tail()
View(dat_csv)

head(dat_csv, n = 2)
tail(dat_csv, n = 2)

## subsetting data frames
# general form: [rows, columns]

mydata <- data.frame(
  patient = c("Smith", "Jones", "Williams"),
  height = c(72, 61, 66),
  diabetic = c(TRUE, FALSE, FALSE)
)

mydata[3, 2]           # row 3, column 2
mydata[1:2, "height"]  # rows 1-2 of the column named height
mydata[, "diabetic"]   # every row of the column named diabetic

# example : Extract the 2nd, 5th, and 10th rows of the variable math in the
# dat_csv data set.
dat_csv[c(2, 5, 10), "math"]

# $ extracts one column as a vector, which can then be subset with []
mydata$height
mydata$height[2:3]

# example : Extract the 2nd, 5th, and 10th rows of the variable math in the
# dat_csv data set using the $ operator
dat_csv$math[c(2, 5, 10)]

## Naming data frame columns
# pattern: colnames(data_frame) <- c("some", "names")

colnames(mydata)                                        # names before
colnames(mydata) <- c("Patient", "Height", "Diabetic")  # rename all columns
colnames(mydata)                                        # names after

colnames(mydata)[3] <- "Diabetes"                       # rename only column 3
colnames(mydata)

## Examining the structure of an object
# dim() - number of rows and columns of a two dimensional object
# str() - structure of the object: its class and the data types of its
#         elements

dim(mydata)
str(mydata)

# example : Examine the structure of dat_csv with str().
str(dat_csv)

## Adding new variables to the data frame
# Mind the length when adding a column: it must fit the number of rows.

mydata$logHeight <- log(mydata$Height)
colnames(mydata)
# A length-5 vector does not fit a 3-row data frame, so this assignment
# fails. try() shows the error message without aborting the script.
try(mydata$z <- rep(0, 5))

# example : Create a data set called test3 that is all rows of the 3 column
# variables math, read, and write from dat_csv.
test3 <- data.frame(dat_csv$math, dat_csv$read, dat_csv$write)
test3

colnames(test3) <- c("Math", "Read", "Write")
colnames(test3)

# Add a variable to test3 called test_mean that is the mean of the variables
# math, read, and write. Specify the data.frame test3 as the only argument
# to rowMeans().
# test3 holds exactly those three columns at this point, so the row mean of
# the whole data frame is the mean of the three scores.
test3$test_mean <- rowMeans(test3)
colnames(test3)

# Use head() to look at the first 5 rows of test3.
head(test3, 5)





### Data Management
## Preparing data for analysis
# load packages
library(dplyr)

## Subsetting rows of a data frame with filter()

dog_data <- data.frame(
  id = c("Duke", "Lucy", "Buddy", "Daisy", "Bear", "Stella"),
  weight = c(25, 12, 58, 67, 33, 9),
  sex = c("M", "F", "M", "F", "M", "F"),
  location = c("north", "west", "north", "south", "west", "west")
)

# dogs weighing more than 40
filter(dog_data, weight > 40)
# female dogs in the north or south locations
filter(dog_data, sex == "F" & location %in% c("north", "south"))

# example : Create a data set from dat_csv called low_read that contains
# observations where the read score is less than or equal to 50.

low_read <- filter(dat_csv, read <= 50)
head(low_read, 3)

# Create a data set from dat_csv called mid_read that contains observations
# where the read score is greater than 50 but also less than or equal to 60.
mid_read <- filter(dat_csv, read > 50 & read <= 60)
head(mid_read, 3)

## Subsetting variables (columns)
# select()

select(dog_data, id, sex)
select(dog_data, -c(id, sex)) # a leading minus drops the named columns

# example : Create a data set called high_read_in that is just the id and
# read variables for observations where read is greater than 60. Create
# another data set called high_read_out that is all of the other variables
# besides read (include id in both data sets) for the same observations with
# read greater than 60.

# extracting the matching rows produces a new data set
read60 <- filter(dat_csv, read > 60)

high_read_in <- select(read60, id, read)
high_read_out <- select(read60, -read)

## Appending observations (appending by rows)
# rbind()

more_dogs <- data.frame(id = c("Jack", "Luna"),
                        weight = c(38, -99),
                        sex = c("M", "F"),
                        location = c("east", "east"))

# both data frames must have the same column names before they are appended
names(dog_data)
colnames(dog_data)

names(more_dogs)

all_dogs <- rbind(dog_data, more_dogs)
all_dogs

# example: Append low_read and mid_read and call the resulting data set
# low_and_mid_read. Check that low_and_mid_read has the correct number of
# observations.

low_and_mid_read <- rbind(low_read, mid_read)
low_and_mid_read
nrow(low_and_mid_read) == nrow(low_read) + nrow(mid_read)

## Adding data columns by merging on a key variable
# inner_join() or merge(); the by = argument controls which column(s) are
# used as the key
dog_vax <- data.frame(id = c("Luna", "Duke", "Buddy", "Stella", "Daisy", "Lucy", "Jack", "Bear"),
                      vaccinated = c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE))

# The key is named explicitly so the join does not silently depend on
# whichever column names the two tables happen to share.
dogs <- inner_join(all_dogs, dog_vax, by = "id")
dogs

# example : Merge high_read_in and high_read_out and call it high_read.
# Append high_read to low_and_mid_read and call it all_read. Check that
# all_read and dat_csv are the same size.

high_read <- inner_join(high_read_in, high_read_out, by = "id")
all_read <- rbind(high_read, low_and_mid_read)
nrow(all_read) == nrow(dat_csv)


## Missing values - NA
# NA marks a blank / unknown field

# recode the -99 placeholder in weight as a real missing value
dogs$weight <- replace(dogs$weight, dogs$weight == -99, NA)
dogs$weight

# Missing values are contagious: any operation touching NA returns NA
1 + 2 + NA
c(1, 2, 3, NA) > 2
dogs$weight
mean(dogs$weight)

# na.rm = TRUE removes NA values first and then computes the result

sum(c(1, 2, NA), na.rm = TRUE)
mean(dogs$weight, na.rm = TRUE)

# example : In dat_csv, the variable science contains -99 values to signify
# missing. How can you identify which rows have -99 values? Convert all of
# these -99 values to NA. Calculate the mean of science ignoring the missing
# values.

dat_csv[dat_csv$science == -99, ]
dat_csv$science <- replace(dat_csv$science, dat_csv$science == -99, NA)
mean(dat_csv$science, na.rm = TRUE)


### Basic Data Analysis
## Descriptive statistics for continuous variables

# small example data set: one row per id, ten rows in total
bloodtest <- data.frame(
  id       = 1:10,
  gender   = c("female", "male", "female", "female", "female",
               "male", "male", "female", "male", "female"),
  hospital = c("CLH", "MH", "MH", "MH", "CLH",
               "MH", "MDH", "MDH", "CLH", "MH"),
  doc_id   = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 3),
  insured  = c(0, 1, 1, 1, 0, 1, 1, 0, 1, 1),
  age      = c(23, 45, 37, 49, 51, 55, 56, 37, 26, 40),
  test1    = c(47, 67, 41, 65, 60, 52, 68, 37, 44, 44),
  test2    = c(46, 57, 47, 65, 62, 51, 62, 44, 46, 61),
  test3    = c(49, 73, 50, 64, 77, 57, 75, 55, 62, 55),
  test4    = c(61, 61, 51, 71, 56, 57, 61, 46, 46, 46)
)

# single-number summaries of age
mean(bloodtest$age)
median(bloodtest$age)
var(bloodtest$age)

# several summaries at once
summary(bloodtest$age)

## Correlations
# provide quick assessments of whether two continuous variables are linearly
# related to one another.
# cor() on two vectors returns a single correlation; cor() on a data frame
# of several variables returns a correlation matrix.

cor(bloodtest$test1, bloodtest$test2)
scores <- select(bloodtest, test1, test2, test3, test4)
cor(scores)

# example : Create a correlation table of the dat_csv variables read, write,
# math, science, and socst.

dat_var <- select(dat_csv, read, write, math, science, socst)
# science contains NA once its -99 codes have been recoded, and cor()
# returns NA for any pair involving a missing value by default.
# "pairwise.complete.obs" computes each correlation from the rows where both
# variables are present.
cor(dat_var, use = "pairwise.complete.obs")

## Frequency Tables
# Mean or median carry little meaning for categorical variables, so a
# frequency table is used to see how observations are distributed across the
# categories.
# table() counts; prop.table() turns the counts into proportions

table(bloodtest$gender)
table(bloodtest$hospital)
prop.table(table(bloodtest$hospital))


## Crosstabs
# two way and multi way frequency tables are used to explore the
# relationships between categorical variables
# margin = 1: proportions within each row; margin = 2: within each column

my2way <- table(bloodtest$gender, bloodtest$hospital)
my2way

prop.table(my2way, margin = 1)
prop.table(my2way, margin = 2)

# example : Determine the proportion of each socio-economic group (variable
# ses) within each school type (variable schtyp) in the dat_csv data set.

# schtyp forms the columns of this table, so "within each school type" means
# column proportions (margin = 2); without margin the proportions are taken
# over the whole table instead.
prop.table(table(dat_csv$ses, dat_csv$schtyp), margin = 2)

## Statistical analysis in R
library(stats)

## Chi-square test of independence
# tests for an association between two categorical variables:
# does the distribution of one depend on the other?
# chisq.test()

chisq.test(x = bloodtest$hospital, y = bloodtest$insured)

## independent sample t tests
# t.test() with a formula: outcome ~ grouping variable
# Is the test1 score associated with gender?
# (Test1 score does not appear to differ between the genders.)
t.test(formula = test1 ~ gender, data = bloodtest)

# example : Perform a t-test to determine whether math scores are different
# between genders (variable female) with data set dat_csv.

t.test(formula = math ~ female, data = dat_csv)


## Paired samples t test
# tests whether the means of two possibly correlated variables are different
# A paired test takes the two vectors plus paired = TRUE instead of a
# formula with ~.
# (The paired t-test suggests that test3 scores are significantly different
# from test1 scores.)
t.test(x = bloodtest$test1, y = bloodtest$test3, paired = TRUE)

## Linear regression
# expands the simple predictor-outcome model of t-tests by allowing more
# predictors
# lm(outcome ~ predictor1 + predictor2, data = ...)
# lm() needs a model formula; here test1 is modelled from age and gender.
m1 <- lm(test1 ~ age + gender, data = bloodtest)
m1

## model objects and extractor functions
summary(m1)

coef(m1)

confint(m1)

# cbind joins column vectors into a matrix: observed, predicted, residual.
# m1 is still the bloodtest model here, so all three columns describe the
# same rows.
cbind(bloodtest$test1, predict(m1), residuals(m1))


## ANOVA
anova(m1)
# Both models are fitted to bloodtest, which anova() requires when comparing
# nested models.
m2 <- lm(test1 ~ age + gender + hospital, data = bloodtest)
anova(m2, m1)
# Hospital does not appear to improve the fit of the model significantly, so
# we would typically choose m1, the more parsimonious model.

# example : Perform a linear regression of the outcome read with predictors
# math, female, and ses using dat_csv. Call the model object m1. Interpret
# your results. Then test whether adding prog improves the fit.
# (The exercise is done after the bloodtest comparisons above because it
# replaces m1.)
m1 <- lm(read ~ math + female + ses, data = dat_csv)
m1
summary(m1)
m3 <- lm(read ~ math + female + ses + prog, data = dat_csv)
anova(m3, m1)

## Regression diagnostics
# 2 x 2 grid of panels for the plots produced by plot(m1)
layout(matrix(c(1, 2, 3, 4), 2, 2))
plot(m1)

## logistic regression - how variation in a binary outcome can be explained
## by a set of predictors




### Graphics

## Scatter plots - plot()
layout(1) # back to a single plotting panel
plot(bloodtest$test1, bloodtest$test2)

# convert the grouping variable to a factor so it can drive the colors
bloodtest$gender <- factor(bloodtest$gender)
plot(bloodtest$test1, bloodtest$test2, col = bloodtest$gender) # color by gender
plot(bloodtest$test1, bloodtest$test2,
     col = bloodtest$gender,
     pch = 17)
# adding labels
plot(bloodtest$test1, bloodtest$test2,
     col = bloodtest$gender,
     pch = 17,
     xlab = "Test 1",
     ylab = "Test 2",
     main = "Plot of Test1 vs Test2")
# adding a legend
plot(bloodtest$test1, bloodtest$test2,
     col = bloodtest$gender,
     pch = 17)
# One legend entry per factor level, so one color per level. plot() colors
# the points by the factor's integer codes, which are 1, 2, ... in level
# order; passing the per-observation factor here would only match by
# accident of row order.
legend("topleft",
       legend = levels(bloodtest$gender),
       col = seq_along(levels(bloodtest$gender)),
       pch = 17)
?pch
# example: Create a scatter plot of read (x-axis) vs write (y-axis), using
# filled square symbols, colored by the variable prog.

dat_csv$prog <- factor(dat_csv$prog)
plot(dat_csv$read, dat_csv$write,
     col = dat_csv$prog,
     pch = 15)

## Histograms - the distributions of continuous variables
# breaks controls how the range is cut into bins
hist(x = bloodtest$test1, breaks = 2)

## Boxplots - compare the distribution of a continuous variable across the
## levels of a categorical variable
boxplot(bloodtest$test2 ~ bloodtest$insured)

# same boxplot with axis labels, a title and a fill color
boxplot(
  bloodtest$test2 ~ bloodtest$insured,
  xlab = "Insured",
  ylab = "Test 2",
  main = "Boxplots of Test2 by Insurance Status",
  col = "lightblue"
)

## Barplots - to visualize the frequencies of levels of grouping variables,
## where the height of the bar represents the number of observations falling
## into that grouping

tab <- table(bloodtest$gender, bloodtest$hospital)
barplot(height = tab)
# adding a legend to a barplot is easy
barplot(height = tab, legend.text = TRUE)
# side-by-side bars
barplot(
  height = tab,
  legend.text = TRUE,
  beside = TRUE,
  col = c("lawngreen", "sandybrown"),
  xlab = "Hospital",
  ylab = "Frequency",
  main = "Frequencies of gender by hospital"
)

# example : Create a bar plot of ses by prog in the data set dat_csv. Use
# the colors red, green, and blue to color the bars. Add a legend.

dt <- table(dat_csv$ses, dat_csv$prog)
barplot(
  height = dt,
  legend.text = TRUE,
  col = c("red", "green", "blue")
)


## Introducing ggplot2 for graphics
library(ggplot2)

## Basic Syntax of a ggplot2 plot

# aesthetics => aes(), shapes => geom_*()
# ggplot(dataset, aes(x = xvar, y = yvar)) + geom_function()
ggplot(data = dat_csv, aes(x = math, y = write)) +
  geom_point()
# add a best-fit regression line
ggplot(dat_csv, aes(x = math, y = write)) +
  geom_point() +
  geom_smooth(method = "lm")
# color points and lines by the variable female
ggplot(dat_csv, aes(x = math, y = write, color = female, fill = female)) +
  geom_point() +
  geom_smooth(method = "lm")

ggplot(dat_csv, aes(x = math, y = write, color = female, fill = female)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~prog) # one panel per value of prog

## Example 2 for ggplot2

# boxplots of math by prog
ggplot(data = dat_csv, aes(x = prog, y = math, fill = schtyp)) +
  geom_boxplot() +
  geom_jitter(width = .05) # show the actual data; a small width keeps the points close together

ggplot(data = dat_csv, aes(x = prog, y = math, fill = schtyp)) +
  geom_boxplot() +
  geom_jitter(width = .05) +
  theme_dark()

ggplot(data = dat_csv, aes(x = prog, y = math, fill = schtyp)) +
  geom_boxplot() +
  geom_jitter(width = .05) +
  theme(panel.background = element_blank(),
        panel.grid = element_blank(),
        axis.ticks = element_blank(),
        text = element_text(family = "serif"))

### ggplot2 exercise
# Use ggplot2 to make a scatter plot of read (x-axis) vs write (y-axis) from
# data set dat_csv. Color the dots by ses.
# Now add a best fit line with geom_smooth(). Why are there 3 lines?

# method = "lm" draws a straight best-fit line, as in the plots above.
# ses is mapped to color, and the exercise expects 3 lines: presumably one
# fitted line per ses group, because a discrete color mapping splits the
# data into groups.
ggplot(data = dat_csv, aes(x = read, y = write, color = ses)) +
  geom_point() +
  geom_smooth(method = "lm")

### Sharing your work
## Rmarkdown
# bar plot of the first slice along the third dimension of HairEyeColor
barplot(
  height = HairEyeColor[, , 1],
  col = c("#4d4d4d", "#bf812d", "#f4a582", "#f6e8c3"),
  legend.text = TRUE,
  xlab = "Eye Color",
  args.legend = list(title = "Hair Color")
)

library(shiny)

# run two of the example apps that ship with shiny
runExample(example = "01_hello")
runExample(example = "06_tabsets")

# 12:35 AM: done! I would like to study shiny a bit more :)
저작자표시 (새창열림)
'Data Analysis > R' 카테고리의 다른 글

[R for Data Science] 3 Data visualization (0)	2022.05.14
[Machine Learning with R] Managing and Understanding Data Part.2 (0)	2022.03.24
[Machine Learning with R] Managing and Understanding Data Part.1 (0)	2022.03.24
매운 블로그

[UCLA : Statistical Consulting Group] Introduction to R

'Data Analysis > R' 카테고리의 다른 글

댓글

티스토리툴바

[UCLA : Statistical Consulting Group] Introduction to R

'Data Analysis > R' 카테고리의 다른 글

관련글

댓글

티스토리툴바