https://stats.oarc.ucla.edu/r/seminars/intro/
Introduction to R Seminar
stats.oarc.ucla.edu
R을 꼼꼼히 배우려다 보니 여기저기 자료를 찾아보고 있는데, 책뿐만 아니라 UCLA OARC에서 제공하는 이 사이트도 정말 좋았다. 예제 문제와 함께 연습할 수 있도록 여러 설명을 제공해 준다. 함수부터 차근차근 따라가면서 데이터 사이언스 책과 함께 공부하면 되게 좋은 듯하다 ㅎㅎ 원래 블로그에 코딩한 것을 올리려 했는데, 저작권 문제가 있다고 해서 그냥 배운 내용을 인증하는 정도로 남겨 보려고 한다!
Today I learned
# 2022-04-06, 9:48 PM: coding start!
### Introduction to R
## Installing packages
# install.packages(pkgs, dependencies = TRUE) also installs the other packages
# that the requested packages need.
# Only packages that are not installed yet are requested, so re-running this
# script does not download and reinstall everything each time.
required_pkgs <- c(
  "dplyr",     # data manipulation
  "ggplot2",   # graphics
  "rmarkdown", # writing reports from R
  "shiny"      # interactive web apps built from R
)
is_installed <- vapply(
  required_pkgs,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, dependencies = TRUE)
}
## Loading packages
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE, so library() is used here.
library(ggplot2)
library(dplyr)
library(shiny)
## Vignettes - longer, tutorial-style guides for a package
# list all available vignettes
vignette()
# open the vignette named "dplyr"
vignette("dplyr")
### Basic R coding
# Assignment: bind the string "hello" to the name `a`
a <- "hello"
# An expression that is incomplete at the end of a line continues on the next
# line, so the two lines below are evaluated together as 2 + 3
2 +
  3
# Functions and help files: open the help page for log() (same as ?log)
help("log")
# Function arguments can be matched by name ...
log(x = 100, base = 10)
# ... or by position (here x = 8, base = 2)
log(8, 2)
# Vectors
# one-dimensional and homogeneous (all elements share one type)
# Creating vectors
first_vec <- c(1, 3, 5)
first_vec
char_vec <- c("these", "are", "some", "words")
length(char_vec)
# element-wise comparison; the result is a logical vector
first_vec < c(2, 2, 2)
# the first argument to rep() is what to repeat, the second is how many times
rep(0, times = 3)
rep("abc", 4)
# the arguments of seq() are from, to, by
seq(from = 1, to = 5, by = 2)
seq(10, 0, -5)
# colon operator: consecutive integers
3:7
# function calls can be nested
rep(seq(1, 3, 1), times = 2)
# example :Create the vector (4,5,6) in three different ways using c(), seq(), and the : operator.Try creating the vector (2,2,1,1) in at least two different ways.
v1 <- c(4, 5, 6)
v2 <- 4:6
v3 <- seq(4, 6, 1)
# second half of the exercise: (2,2,1,1) written out and built with rep()
v4 <- c(2, 2, 1, 1)
v5 <- rep(c(2, 1), each = 2)
# Subsetting vectors with []
# wrapping an assignment in () also prints the assigned value
(a <- seq(10, 1, -1))
a[2]
a[seq(1, 5)]
a[c(1, 3, 4)]
# example : Create the vector y as the integers counting down from 10 to 1. Extract the second, fifth, and seventh element of this vector y.
y <- 10:1
y[c(2, 5, 7)]
# conditional selection - subsetting by value
scores <- c(55, 24, 43, 10)
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, the full names are reserved words
scores[c(FALSE, TRUE, TRUE, FALSE)]
# a comparison yields a logical vector ...
scores < 30
# ... which can be placed inside [] to keep only the matching elements
scores[scores < 30]
# example : Use conditional selection to find the numbers in y (integers from 10 to 1) that when multiplied by 2, the result is greater than 15.
y[y * 2 > 15]
### Importing and Exporting Data
## Dataset files
## Reading in text data
# read.csv() reads comma-separated text and read.delim() reads delimited text
# (tab-separated here); pass header = FALSE when the file has no header row.
# example : Create a dataset called dat_csv by loading a dataset from our server at this address: https://stats.idre.ucla.edu/stat/data/hsbraw.csv .
dat_csv <- read.csv("https://stats.idre.ucla.edu/stat/data/hsbraw.csv")
## Exporting Data
# write.csv(), save(), load()
# The export/import calls below are run against real files in the session's
# temporary directory so that every line can be executed as written.
csv_path <- file.path(tempdir(), "hsbraw.csv")
tab_path <- file.path(tempdir(), "hsbraw.tsv")
rdata_path <- file.path(tempdir(), "hsbraw.RData")
# export the data frame as comma-separated and as tab-separated text
write.csv(dat_csv, file = csv_path, row.names = FALSE)
write.table(dat_csv, file = tab_path, sep = "\t", row.names = FALSE)
# read the exported files back in
data <- read.csv(csv_path)
dat.tab <- read.delim(tab_path, sep = "\t")
# save() writes the listed R objects into one file in R's own format (not
# CSV); load() restores them under the names they were saved with.
# mydata is not included because it is only created later in this script.
save(data, dat.tab, file = rdata_path)
load(rdata_path)
# packages for importing other formats
# readxl - Excel files
# haven - Stata, SAS, SPSS
### Data Frames
## Data frames
# read.csv() and read.table() return data frames.
# A data frame is rectangular: columns are variables and rows are
# observations of those variables.
# Columns may hold different data types but must all have the same length.
# In short: two-dimensional, heterogeneous, rectangular.
## Viewing data as a spreadsheet with View(); first/last rows with head()/tail()
View(dat_csv)
head(dat_csv, 2)
tail(dat_csv, 2)
## Subsetting data frames
# general form: data_frame[rows, columns]
mydata <- data.frame(
  patient = c("Smith", "Jones", "Williams"),
  height = c(72, 61, 66),
  diabetic = c(TRUE, FALSE, FALSE)
)
# row 3, column 2
mydata[3, 2]
# rows 1 and 2 of the column named "height"
mydata[1:2, "height"]
# all rows of the column named "diabetic"
mydata[, "diabetic"]
# example : Extract the 2nd, 5th, and 10th rows of the variable math in the dat_csv data set.
dat_csv[c(2, 5, 10), "math"]
# $ extracts a single column as a vector, which can then be subset with []
mydata$height
mydata$height[2:3]
#example : Extract the 2nd, 5th, and 10th rows of the variable math in the dat_csv data set using the $ operator
dat_csv$math[c(2, 5, 10)]
## Naming data frame columns
# colnames(data_frame) <- c("some", "names")
colnames(mydata)
# replace all column names at once
colnames(mydata) <- c("Patient", "Height", "Diabetic")
colnames(mydata)
# replace only the third column name
colnames(mydata)[3] <- "Diabetes"
colnames(mydata)
## Examining the structure of an object
# dim() gives the number of rows and columns of a two-dimensional object
# str() shows the structure of an object: its class and the data types of
# its elements
dim(mydata)
str(mydata)
# example : Examine the structure of dat_csv with str().
str(dat_csv)
## Adding new variables to the data frame
# When adding a column, mind its length: it has to fit the number of rows.
mydata$logHeight <- log(mydata$Height)
colnames(mydata)
# Deliberate length mismatch: 5 values do not fit a 3-row data frame, so this
# assignment fails. try() shows the error without stopping the script.
try(mydata$z <- rep(0, 5))
# example : Create a data set called test3 that is all rows of the 3 column variables math, read, and write from dat_csv.
test3 <- dat_csv[, c("math", "read", "write")]
test3
colnames(test3) <- c("Math", "Read", "Write")
colnames(test3)
# Add a variable to test3 called test_mean that is the mean of the variables math, read, and write. Specify the data.frame test3 as the only argument to rowMeans().
# rowMeans() averages across the columns of each row. test3 holds only the
# three score columns at this point, so it is passed as the sole argument.
test3$test_mean <- rowMeans(test3)
colnames(test3)
# Use head() to look at the first 5 rows of test3.
head(test3, 5)
### Data Management
## Preparing data for analysis
# attach dplyr, which provides filter() and select()
library(dplyr)
## Subsetting rows of a data frame with filter()
dog_data <- data.frame(
  id = c("Duke", "Lucy", "Buddy", "Daisy", "Bear", "Stella"),
  weight = c(25, 12, 58, 67, 33, 9),
  sex = c("M", "F", "M", "F", "M", "F"),
  location = c("north", "west", "north", "south", "west", "west")
)
# dogs weighing more than 40
filter(dog_data, weight > 40)
# female dogs in the north or south locations
filter(dog_data, sex == "F" & (location == "north" | location == "south"))
# example : Create a data set from dat_csv called low_read that contains observations where the read score is less than or equal to 50.
low_read <- filter(dat_csv, read <= 50)
head(low_read, 3)
# Create a data set from dat_csv called mid_read that contains observations where the read score is greater than 50 but also less than or equal to 60.
mid_read <- filter(dat_csv, read > 50 & read <= 60)
head(mid_read, 3)
## Subsetting variables (columns)
# select() keeps the named columns ...
select(dog_data, id, sex)
# ... and a minus sign drops them instead
select(dog_data, -c(id, sex))
#example : Create a data set called high_read_in that is just the id and read variables for observations where read is greater than 60. Create another data set called high_read_out that is all of the other variables besides read (include id in both data sets) for the same observations with read greater than 60.
# extracting rows this way creates a new data set
read60 <- filter(dat_csv, read > 60)
high_read_in <- select(read60, id, read)
high_read_out <- select(read60, -read)
## Appending observations (appending by rows)
# rbind() stacks data frames that have the same columns
more_dogs <- data.frame(
  id = c("Jack", "Luna"),
  weight = c(38, -99),
  sex = c("M", "F"),
  location = c("east", "east")
)
# both data frames must have the same column names before appending
names(dog_data)
colnames(dog_data)
names(more_dogs)
all_dogs <- rbind(dog_data, more_dogs)
all_dogs
#example: Append low_read and mid_read and call the resulting data set low_and_mid_read. Check in the Environment pane that low_and_mid_read has the correct number of observations.
low_and_mid_read <- rbind(low_read, mid_read)
low_and_mid_read
## Adding data columns by merging on a key variable
# inner_join() (dplyr) or merge() (base); by = names the key column(s)
dog_vax <- data.frame(
  id = c("Luna", "Duke", "Buddy", "Stella", "Daisy", "Lucy", "Jack", "Bear"),
  vaccinated = c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE)
)
# The key is stated explicitly instead of letting inner_join() guess it from
# the shared column names; id is the only column the two tables share.
dogs <- inner_join(all_dogs, dog_vax, by = "id")
dogs
#example : Merge high_read_in and high_read_out and call it high_read. Append high_read to low_and_mid_read and call it all_read. Check in the Environment pane that all_read and dat_csv are the same size.
high_read <- inner_join(high_read_in, high_read_out, by = "id")
all_read <- rbind(high_read, low_and_mid_read)
# the size check the exercise asks for, done in code
nrow(all_read) == nrow(dat_csv)
## Missing values - NA
# NA is R's marker for a missing value, e.g. a blank field
# recode the placeholder -99 in weight as NA
dogs$weight <- replace(dogs$weight, dogs$weight == -99, NA)
dogs$weight
# Missing values are contagious: operations involving NA return NA
1 + 2 + NA
c(1, 2, 3, NA) > 2
dogs$weight
mean(dogs$weight)
# na.rm = TRUE removes NA values before the result is calculated
sum(c(1, 2, NA), na.rm = TRUE)
mean(dogs$weight, na.rm = TRUE)
#example :In dat_csv, the variable science contains -99 values to signify missing. How can you identify which rows have -99 values? Convert all of these -99 values to NA. Calculate the mean of science ignoring the missing values.
# rows whose science value is the placeholder -99
dat_csv[dat_csv$science == -99, ]
dat_csv$science <- replace(dat_csv$science, dat_csv$science == -99, NA)
mean(dat_csv$science, na.rm = TRUE)
### Basic Data Analysis
## Descriptive statistics for continuous variables
bloodtest <- data.frame(
  id = 1:10,
  gender = c("female", "male", "female", "female", "female",
             "male", "male", "female", "male", "female"),
  hospital = c("CLH", "MH", "MH", "MH", "CLH", "MH", "MDH", "MDH", "CLH", "MH"),
  doc_id = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 3),
  insured = c(0, 1, 1, 1, 0, 1, 1, 0, 1, 1),
  age = c(23, 45, 37, 49, 51, 55, 56, 37, 26, 40),
  test1 = c(47, 67, 41, 65, 60, 52, 68, 37, 44, 44),
  test2 = c(46, 57, 47, 65, 62, 51, 62, 44, 46, 61),
  test3 = c(49, 73, 50, 64, 77, 57, 75, 55, 62, 55),
  test4 = c(61, 61, 51, 71, 56, 57, 61, 46, 46, 46)
)
mean(bloodtest$age)
median(bloodtest$age)
var(bloodtest$age)
summary(bloodtest$age)
## Correlations
# A quick assessment of whether two continuous variables are linearly related.
# cor() on two vectors gives a single correlation; on a data frame of numeric
# columns it gives a correlation matrix.
cor(bloodtest$test1, bloodtest$test2)
scores <- select(bloodtest, test1, test2, test3, test4)
cor(scores)
# example :Create a correlation table of the dat_csv variables read, write, math, science, and socst.
dat_var <- select(dat_csv, read, write, math, science, socst)
# science contains NA once its -99 codes have been recoded, and cor() returns
# NA for any pair involving a missing value by default. Each correlation is
# therefore computed from the rows where both variables are observed.
cor(dat_var, use = "pairwise.complete.obs")
## Frequency Tables
# Mean and median are not meaningful for categorical variables, so a
# frequency table is used to look at how the categories are distributed.
# table() counts; prop.table() turns the counts into proportions
table(bloodtest$gender)
table(bloodtest$hospital)
prop.table(table(bloodtest$hospital))
## Crosstabs
# two-way and multi-way frequency tables are used to explore the
# relationships between categorical variables
# margin = 1: proportions within each row; margin = 2: within each column
my2way <- table(bloodtest$gender, bloodtest$hospital)
my2way
prop.table(my2way, margin = 1)
prop.table(my2way, margin = 2)
# example : Determine the proportion of each socio-economic group (variable ses) within each school type (variable schtyp) in the dat_csv data set.
# schtyp forms the columns, so "within each school type" means column
# proportions (margin = 2); without margin the proportions are of the grand
# total instead.
prop.table(table(dat_csv$ses, dat_csv$schtyp), margin = 2)
## Statistical analysis in R
# the stats package provides the tests used below
library(stats)
## Chi-square test of independence
# tests for an association between two categorical variables
# chisq.test() - categorical data: does the distribution of one variable
# depend on the other?
chisq.test(x = bloodtest$hospital, y = bloodtest$insured)
## Independent samples t tests
# t.test() with a formula: outcome ~ two-level grouping variable
# is the test1 score associated with gender?
# author's note: Test1 score does not appear to differ between the genders
t.test(test1 ~ gender, data = bloodtest)
# example : Perform a t-test to determine whether math scores are different between genders (variable female) with data set dat_csv.
t.test(math ~ female, data = dat_csv)
## Paired samples t test
# tests whether the means of two possibly correlated variables differ
# the paired test does not use the ~ formula; pass both vectors and set
# paired = TRUE
# author's note: the paired t-test suggests that test3 scores are
# significantly different from test1 scores
t.test(x = bloodtest$test1, y = bloodtest$test3, paired = TRUE)
## Linear regression
# expands the simple predictor-outcome model of t-tests by allowing more predictors
# lm() needs a model formula and a data set. m1 is m2 (below) without
# hospital, i.e. the smaller of the two models compared in the ANOVA section.
m1 <- lm(test1 ~ age + gender, data = bloodtest)
m1
## Model objects and extractor functions
summary(m1)
coef(m1)
confint(m1)
# cbind joins column vectors into a matrix: observed test1, fitted values and
# residuals, all from the same bloodtest rows
cbind(bloodtest$test1, predict(m1), residuals(m1))
## ANOVA
anova(m1)
m2 <- lm(test1 ~ age + gender + hospital, data = bloodtest)
# both models are fitted to bloodtest, which the comparison relies on
anova(m2, m1)
# Hospital does not appear to improve the fit of the model significantly, so we would typically choose m1, the more parsimonious model.
# The dat_csv exercises come after the bloodtest analysis because they reuse
# the name m1; fitting them earlier would replace the bloodtest model that the
# cbind() and anova() calls above depend on.
# example : Perform a linear regression of the outcome read with predictors math, female, and ses using dat_csv. Call the model object m1. Interpret your results.
m1 <- lm(read ~ math + female + ses, data = dat_csv)
m1
coef(m1)
confint(m1)
# example: does adding prog improve the model?
m3 <- lm(read ~ math + female + ses + prog, data = dat_csv)
anova(m3, m1)
## Regression diagnostics
# 2 x 2 grid so that the diagnostic plots share one page
layout(matrix(c(1, 2, 3, 4), 2, 2))
plot(m1)
## logistic regression- how variation in a binary outcome can be explained by a set of predictors
### Graphics
## Scatter plots - plot()
# go back to a single plotting region
layout(1)
plot(bloodtest$test1, bloodtest$test2)
# turn the grouping variable into a factor so it can drive the colours
bloodtest$gender <- factor(bloodtest$gender)
# colour by gender
plot(bloodtest$test1, bloodtest$test2, col = bloodtest$gender)
# pch selects the plotting symbol
plot(bloodtest$test1, bloodtest$test2,
     col = bloodtest$gender,
     pch = 17)
# adding labels
plot(bloodtest$test1, bloodtest$test2,
     col = bloodtest$gender,
     pch = 17,
     xlab = "Test 1",
     ylab = "Test 2",
     main = "Plot of Test1 vs Test2")
# adding a legend
plot(bloodtest$test1, bloodtest$test2,
     col = bloodtest$gender,
     pch = 17)
# The legend gets one colour per factor level, in level order, which matches
# the integer codes the factor supplies as colours in plot(). Passing the
# whole per-observation gender column would only line up by accident of row
# order.
gender_levels <- levels(bloodtest$gender)
legend("topleft",
       legend = gender_levels,
       col = seq_along(gender_levels),
       pch = 17)
?pch
# example: Create a scatter plot of read (x-axis) vs write (y-axis), using filled square symbols, colored by the variable prog.
dat_csv$prog <- factor(dat_csv$prog)
plot(dat_csv$read, dat_csv$write,
     col = dat_csv$prog,
     pch = 15)
## Histograms - the distributions of continuous variables
# breaks controls how the range of the data is cut into bins
hist(bloodtest$test1, breaks = 2)
## Boxplots - compare the distribution of a continuous variable across the levels of a categorical variable
boxplot(bloodtest$test2 ~ bloodtest$insured)
# the same boxplot with axis labels, a title and a fill colour
boxplot(
  bloodtest$test2 ~ bloodtest$insured,
  xlab = "Insured",
  ylab = "Test 2",
  main = "Boxplots of Test2 by Insurance Status",
  col = "lightblue"
)
## Barplots - to visualize the frequencies of levels of grouping variables, where the height of the bar represents the number of observations falling into that grouping
# barplot() draws from a table of counts: gender by hospital here
tab <- table(bloodtest$gender, bloodtest$hospital)
barplot(tab)
# legend.text = TRUE adds a legend built from the row names of the table
barplot(tab, legend.text = TRUE)
# beside = TRUE draws side-by-side bars instead of stacked ones
barplot(
  tab,
  legend.text = TRUE,
  beside = TRUE,
  col = c("lawngreen", "sandybrown"),
  xlab = "Hospital",
  ylab = "Frequency",
  main = "Frequencies of gender by hospital"
)
# example : Create a bar plot of ses by prog in the data set dat_csv. Use the colors red, green, and blue to color the bars. Add a legend.
dt <- table(dat_csv$ses, dat_csv$prog)
barplot(dt, legend.text = TRUE, col = c("red", "green", "blue"))
## Introducing ggplot2 for graphics
library(ggplot2)
## Basic syntax of a ggplot2 plot
# aes() maps variables to aesthetics; geom_*() functions choose the shapes
# ggplot(dataset, aes(x = xvar, y = yvar)) + geom_function()
# scatter plot of write against math
math_write <- ggplot(data = dat_csv, mapping = aes(x = math, y = write)) +
  geom_point()
math_write
# the same scatter plot with a best-fit regression line
math_write + geom_smooth(method = "lm")
# colour and fill mapped to female: points and fit lines are drawn per group
math_write_by_female <- ggplot(
  dat_csv,
  aes(x = math, y = write, colour = female, fill = female)
) +
  geom_point() +
  geom_smooth(method = "lm")
math_write_by_female
# split the plot into one panel per value of prog
math_write_by_female + facet_wrap(~prog)
## Example 2 for ggplot2
# boxplots of math by prog, filled by schtyp
# geom_jitter() overlays the actual data; a small width keeps the points
# close together
math_by_prog <- ggplot(data = dat_csv, aes(x = prog, y = math, fill = schtyp)) +
  geom_boxplot() +
  geom_jitter(width = .05)
math_by_prog
# same plot with a built-in complete theme
math_by_prog + theme_dark()
# same plot with individual theme elements removed and a serif font
math_by_prog +
  theme(
    panel.background = element_blank(),
    panel.grid = element_blank(),
    axis.ticks = element_blank(),
    text = element_text(family = "serif")
  )
### ggplot2 exercise
# Use ggplot2 to make a scatter plot of read (x-axis) vs write (y-axis) from data set dat_csv. Color the dots by ses.
# Now add a best fit line with geom_smooth(). Why are there 3 lines?
ggplot(data = dat_csv, aes(x = read, y = write, colour = ses)) +
  geom_point() +
  geom_smooth()
### Sharing your work
## R Markdown
# Bar plot of the first slice along the third dimension of the built-in
# HairEyeColor table, with one fill colour per row of that slice
hair_by_eye <- HairEyeColor[, , 1]
barplot(
  hair_by_eye,
  col = c("#4d4d4d", "#bf812d", "#f4a582", "#f6e8c3"),
  legend.text = TRUE,
  xlab = "Eye Color",
  args.legend = list(title = "Hair Color")
)
## Shiny
# run two of the example apps that ship with shiny
library(shiny)
runExample("01_hello")
runExample("06_tabsets")
# 12:35 AM: done! I would like to study shiny some more :)
'Data Analysis > R' 카테고리의 다른 글
[R for Data Science] 3 Data visualization (0) | 2022.05.14 |
---|---|
[Machine Learning with R] Managing and Understanding Data Part.2 (0) | 2022.03.24 |
[Machine Learning with R] Managing and Understanding Data Part.1 (0) | 2022.03.24 |
댓글