본문 바로가기

R programming

학생 시험성적 t-test 및 아노바

반응형

데이터를 분석하면서 중요한 것 중 하나는 통계에 대한 이해다.

코드를 작성할 줄 알아도, 나온 결과를 제대로 이해하고 해석할 수 있어야 한다는 것을 느끼게 해 준 분석이다.


원 데이터 자료는 캐글에서 가져왔다.

https://www.kaggle.com/spscientist/students-performance-in-exams


코드는 GitHub에도 업로드해 두었다.

통계 공부 열심히 해야겠다 정말,,,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Setup ----
# Attach the packages before any analysis code runs, so a missing dependency
# fails immediately instead of part-way through the script.
library(ggplot2)   # box plots
library(dplyr)     # %>% and select()
library(agricolae) # attached by the original script; kept as-is

# Directory that holds the Kaggle "Students Performance in Exams" CSV.
# The file is read through an explicit path rather than via setwd(), so the
# script no longer changes the working directory of the R session.
data_dir <- "c:/practice"
student <- read.csv(file.path(data_dir, "StudentsPerformance.csv"))

# Quick overview of the loaded data frame: structure, dimensions, summaries.
str(student)
dim(student)
summary(student)
# Do scores differ by gender? ----
# Box plot of math scores, one box per gender
ggplot(student, aes(gender, math.score)) +
  geom_boxplot() +
  ggtitle("math score")
# Box plot of reading scores, one box per gender
ggplot(student, aes(gender, reading.score)) +
  geom_boxplot() +
  ggtitle("reading score")
# Box plot of writing scores, one box per gender
ggplot(student, aes(gender, writing.score)) +
  geom_boxplot() +
  ggtitle("writing score")
# Two-sample t-tests of each score by gender.
# var.equal is not set, so t.test() uses its default setting here.

# Math score
student_math <- select(student, gender, math.score)
t.test(math.score ~ gender, data = student_math)

# Reading score
student_reading <- select(student, gender, reading.score)
t.test(reading.score ~ gender, data = student_reading)

# Writing score
student_writing <- select(student, gender, writing.score)
t.test(writing.score ~ gender, data = student_writing)
# Do scores differ by race/ethnicity? ----
# For each score: per-group mean, per-group standard deviation, then the
# one-way ANOVA table.

# Math score
aggregate(math.score ~ race.ethnicity, data = student, FUN = mean)
aggregate(math.score ~ race.ethnicity, data = student, FUN = sd)
summary(aov(math.score ~ race.ethnicity, data = student))

# Reading score
aggregate(reading.score ~ race.ethnicity, data = student, FUN = mean)
aggregate(reading.score ~ race.ethnicity, data = student, FUN = sd)
summary(aov(reading.score ~ race.ethnicity, data = student))

# Writing score
aggregate(writing.score ~ race.ethnicity, data = student, FUN = mean)
aggregate(writing.score ~ race.ethnicity, data = student, FUN = sd)
summary(aov(writing.score ~ race.ethnicity, data = student))
# Do scores differ by parental level of education? ----
# For each score: per-group mean, per-group standard deviation, then the
# one-way ANOVA table.

# Math score
aggregate(math.score ~ parental.level.of.education, data = student, FUN = mean)
aggregate(math.score ~ parental.level.of.education, data = student, FUN = sd)
summary(aov(math.score ~ parental.level.of.education, data = student))

# Reading score
aggregate(reading.score ~ parental.level.of.education, data = student, FUN = mean)
aggregate(reading.score ~ parental.level.of.education, data = student, FUN = sd)
summary(aov(reading.score ~ parental.level.of.education, data = student))

# Writing score
aggregate(writing.score ~ parental.level.of.education, data = student, FUN = mean)
aggregate(writing.score ~ parental.level.of.education, data = student, FUN = sd)
summary(aov(writing.score ~ parental.level.of.education, data = student))
 
# Do scores differ by the `lunch` variable? ----
# Two-sample t-tests with equal variances assumed (var.equal = TRUE).
# TRUE is written out because T is an ordinary variable that can be
# reassigned, which would silently change which test is run.

# Math score
t.test(math.score ~ lunch, data = student, var.equal = TRUE)
# Reading score
t.test(reading.score ~ lunch, data = student, var.equal = TRUE)
# Writing score
t.test(writing.score ~ lunch, data = student, var.equal = TRUE)
 
# Do scores differ by whether the test preparation course was taken? ----
# Two-sample t-tests with equal variances assumed (var.equal = TRUE).
# TRUE is written out because T is an ordinary variable that can be
# reassigned, which would silently change which test is run.

# Math score
t.test(math.score ~ test.preparation.course, data = student, var.equal = TRUE)
# Reading score
t.test(reading.score ~ test.preparation.course, data = student, var.equal = TRUE)
# Writing score
t.test(writing.score ~ test.preparation.course, data = student, var.equal = TRUE)
 
# Which of the variables above has the largest effect? ----
# One linear model per score, each regressing the score on all five
# grouping variables; summary() prints the coefficient table for each fit.

# Math score
math.regression <- lm(
  math.score ~ gender + race.ethnicity + parental.level.of.education +
    lunch + test.preparation.course,
  data = student
)
summary(math.regression)

# Reading score
reading.regression <- lm(
  reading.score ~ gender + race.ethnicity + parental.level.of.education +
    lunch + test.preparation.course,
  data = student
)
summary(reading.regression)

# Writing score
writing.regression <- lm(
  writing.score ~ gender + race.ethnicity + parental.level.of.education +
    lunch + test.preparation.course,
  data = student
)
summary(writing.regression)
cs


반응형