-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathR-dplyr
More file actions
123 lines (104 loc) · 4.45 KB
/
R-dplyr
File metadata and controls
123 lines (104 loc) · 4.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
## Basics ##
# Read the CSV at the path below into a data frame and echo it.
path <- "C:/Contents/Kaggle/Titanic/titanic/train.csv"
tbl <- read.csv(file = path)
tbl

library(dplyr)

# Keep only the six columns used in the steps below.
temp <- tbl %>%
  select(PassengerId, Survived, Pclass, Name, Sex, Age)

# Rows where Sex is "male".
tempmale <- temp %>%
  filter(Sex == "male")

# Add age_factor, defined as Age scaled by 1.05.
tempmale1 <- tempmale %>%
  mutate(age_factor = Age * 1.05)

# Group by Age; the result stays grouped when previewed below.
tempmale2 <- tempmale1 %>%
  group_by(Age)

head(tempmale2)
## Descriptive stats ##
> # Cars93 is provided by MASS, so attach it before the first use.
> # Note: attaching MASS after dplyr masks dplyr::select().
> library(MASS)
> # Frequency table of the Manufacturer column
> table(Cars93$Manufacturer)
Acura Audi BMW Buick Cadillac Chevrolet Chrylser Chrysler
2 2 1 4 2 8 1 2
Dodge Eagle Ford Geo Honda Hyundai Infiniti Lexus
6 2 8 2 3 4 1 2
Lincoln Mazda Mercedes-Benz Mercury Mitsubishi Nissan Oldsmobile Plymouth
2 5 2 2 2 4 4 1
Pontiac Saab Saturn Subaru Suzuki Toyota Volkswagen Volvo
5 1 1 3 1 4 4 2
> # Location and spread of the Weight column
> min(Cars93$Weight)
[1] 1695
> max(Cars93$Weight)
[1] 4105
> mean(Cars93$Weight)
[1] 3072.903
> median(Cars93$Weight)
[1] 3040
> quantile(Cars93$Weight)
0% 25% 50% 75% 100%
1695 2620 3040 3525 4105
> sd(Cars93$Weight)
[1] 589.8965
> sum(Cars93$Weight)
[1] 285780
> # Correlation between Price and Passengers
> cor(x = Cars93$Price, y = Cars93$Passengers)
[1] 0.05786007
## Visualization ##
library(ISLR) # for the Wage dataset
library(ggplot2) # provides ggplot(), aes() and geom_point() used below
data("Wage")
# Scatter plot of wage (y) against age (x).
ggplot(data = Wage, mapping = aes(x = age, y = wage)) +
  geom_point()
## Statistical Model ##
library("MASS")
> data("iris")
> head(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
> # Scatter of the sepal measurements; title and axis labels name the columns actually plotted.
> plot(x = iris$Sepal.Length, y = iris$Sepal.Width, main = "Iris sepal width vs length", xlab = "Sepal length", ylab = "Sepal width")
> # Regress petal length on petal width. Bare column names (rather than iris$...) are
> # required for predict() to look the predictor up in newdata.
> model <- lm(formula = Petal.Length ~ Petal.Width, data = iris)
> # Predictor on x and response on y, so the fitted line drawn below lands on this plot.
> plot(x = iris$Petal.Width, y = iris$Petal.Length, main = "Iris petal length vs width", xlab = "Petal width", ylab = "Petal length")
> summary(model)
Call:
lm(formula = Petal.Length ~ Petal.Width, data = iris)
Residuals:
Min 1Q Median 3Q Max
-1.33542 -0.30347 -0.02955 0.25776 1.39453
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.08356 0.07297 14.85 <2e-16 ***
Petal.Width 2.22994 0.05140 43.39 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4782 on 148 degrees of freedom
Multiple R-squared: 0.9271, Adjusted R-squared: 0.9266
F-statistic: 1882 on 1 and 148 DF, p-value: < 2.2e-16
> # Overlay the fitted petal lengths against the predictor; fitted() avoids
> # relying on partial matching of model$fitted.
> lines(x = iris$Petal.Width, y = fitted(model), col = "red", lwd = 3)
> cor(x=iris$Petal.Length, y =iris$Petal.Width)
[1] 0.9628654
> # predict: newdata must carry the predictor column (Petal.Width); the values
> # are kept inside the observed width range to avoid extrapolating.
> predict(object = model, newdata = data.frame(Petal.Width = c(0.5, 1.5, 2.5)))
## decision tree classifier ##
> library(MASS)
> data(iris)
> set.seed(42)
> # Draw 100 of the 150 row indices for training; the remaining 50 rows form the test set.
> indexes<- sample(x=1:150, size=100)
> print(indexes)
[1] 49 65 74 146 122 150 128 47 24 71 100 89 110 20 114 111 131 41 139 27 109 5 84 34 92 104 3 58 97
[30] 42 142 30 43 15 22 123 8 36 68 86 18 130 126 69 4 98 50 99 88 87 145 26 6 105 2 124 21 96
[59] 115 10 40 129 33 140 73 29 76 9 35 16 107 93 120 138 80 55 90 94 57 121 77 13 53 54 32 60 85
[88] 17 44 83 72 135 118 149 48 136 64 38 1 144
> train <- iris[indexes, ]
> test <- iris[-indexes, ]
> library(tree)
> # This fit uses all 150 rows (data = iris), so the misclassification rate
> # reported below is measured on the same rows the tree was grown from.
> model<- tree(formula= Species ~., data =iris)
> summary(model)
Classification tree:
tree(formula = Species ~ ., data = iris)
Variables actually used in tree construction:
[1] "Petal.Length" "Petal.Width" "Sepal.Length"
Number of terminal nodes: 6
Residual mean deviance: 0.1253 = 18.05 / 144
Misclassification error rate: 0.02667 = 4 / 150
> plot(model)
> text(model)
> # Hold-out check using the split created above: grow a tree on the training
> # rows only, then compare predicted and actual species on the 50 unseen rows.
> # (Output not recorded; re-run to see the confusion matrix.)
> holdout_model <- tree(formula = Species ~ ., data = train)
> holdout_pred <- predict(object = holdout_model, newdata = test, type = "class")
> table(predicted = holdout_pred, actual = test$Species)
>