Getting-Started-Machine-Learning/R-dplyr at master · amithsbhat/Getting-Started-Machine-Learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
## Basics ##
# Read the CSV at `path` into a data frame and print it in full.
path <- "C:/Contents/Kaggle/Titanic/titanic/train.csv"
tbl <- read.csv(file = path)
tbl

library(dplyr)

# Keep only the six columns used in the steps below.
temp <- tbl %>%
  select(PassengerId, Survived, Pclass, Name, Sex, Age)

# Keep the rows whose Sex is exactly "male".
tempmale <- temp %>%
  filter(Sex == "male")

# Add a derived column: Age scaled by 1.05.
tempmale1 <- tempmale %>%
  mutate(age_factor = Age * 1.05)

# Group the rows by Age, then show the first rows of the grouped table.
tempmale2 <- tempmale1 %>%
  group_by(Age)
head(tempmale2)

## Descriptive stats ##

> # Cars93 is provided by the MASS package, so attach it before use
> library(MASS)
> #Frequency table
> table(Cars93$Manufacturer)

        Acura          Audi           BMW         Buick      Cadillac     Chevrolet      Chrylser      Chrysler
            2             2             1             4             2             8             1             2
        Dodge         Eagle          Ford           Geo         Honda       Hyundai      Infiniti         Lexus
            6             2             8             2             3             4             1             2
      Lincoln         Mazda Mercedes-Benz       Mercury    Mitsubishi        Nissan    Oldsmobile      Plymouth
            2             5             2             2             2             4             4             1
      Pontiac          Saab        Saturn        Subaru        Suzuki        Toyota    Volkswagen         Volvo
            5             1             1             3             1             4             4             2

> # Summary statistics of the Weight column
> min(Cars93$Weight)
[1] 1695
> max(Cars93$Weight)
[1] 4105
> mean(Cars93$Weight)
[1] 3072.903
> median(Cars93$Weight)
[1] 3040
> quantile(Cars93$Weight)
  0%  25%  50%  75% 100%
1695 2620 3040 3525 4105
> sd(Cars93$Weight)
[1] 589.8965
> sum(Cars93$Weight)
[1] 285780
> # Correlation between Price and Passengers
> cor(x = Cars93$Price, y = Cars93$Passengers)
[1] 0.05786007

## Visualization ##

library(ISLR) # for the Wage dataset
library(ggplot2) # for ggplot(), aes() and geom_point(); ISLR does not attach it
data("Wage")
# Scatter plot with age on the x axis and wage on the y axis.
ggplot(data = Wage, mapping = aes(x = age, y = wage)) +
  geom_point()

## Statistical Model ##

> library("MASS")
> data("iris")
> head(iris)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
> # Sepal scatter plot; the title and axis labels name the sepal columns that are actually plotted
> plot(x = iris$Sepal.Length, y = iris$Sepal.Width, main = "Iris sepal length vs width", xlab = "Sepal length", ylab = "Sepal width")
> # Regress petal length (response) on petal width (predictor).
> # Bare column names are used with data = iris: with iris$... terms in the
> # formula, predict() cannot substitute the columns supplied in newdata.
> model <- lm(formula = Petal.Length ~ Petal.Width, data = iris)
> # Predictor on the x axis, response on the y axis, so the fitted line can be overlaid
> plot(x = iris$Petal.Width, y = iris$Petal.Length, main = "Iris petal length vs width", xlab = "Petal width", ylab = "Petal length")
> summary(model)

Call:
lm(formula = Petal.Length ~ Petal.Width, data = iris)

Residuals:
     Min       1Q   Median       3Q      Max
-1.33542 -0.30347 -0.02955  0.25776  1.39453

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  1.08356    0.07297   14.85   <2e-16 ***
Petal.Width  2.22994    0.05140   43.39   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4782 on 148 degrees of freedom
Multiple R-squared:  0.9271,	Adjusted R-squared:  0.9266
F-statistic:  1882 on 1 and 148 DF,  p-value: < 2.2e-16

> # Overlay the fitted petal lengths against the predictor used in the model;
> # fitted() avoids relying on partial matching of model$fitted.values
> lines(x = iris$Petal.Width, y = fitted(model), col = "red", lwd = 3)
> cor(x = iris$Petal.Length, y = iris$Petal.Width)
[1] 0.9628654
> #predict
> # newdata must contain the predictor column (Petal.Width), not the response;
> # the widths chosen here lie between 0 and 2.5
> predict(object = model, newdata = data.frame(Petal.Width = c(0.5, 1.5, 2.5)))

## decision tree classifier ##

> library(MASS)
> data(iris)
> set.seed(42)
> # Draw 100 of the 150 row indices for training; the remaining 50 form the test set
> indexes <- sample(x = 1:150, size = 100)
> print(indexes)
  [1]  49  65  74 146 122 150 128  47  24  71 100  89 110  20 114 111 131  41 139  27 109   5  84  34  92 104   3  58  97
 [30]  42 142  30  43  15  22 123   8  36  68  86  18 130 126  69   4  98  50  99  88  87 145  26   6 105   2 124  21  96
 [59] 115  10  40 129  33 140  73  29  76   9  35  16 107  93 120 138  80  55  90  94  57 121  77  13  53  54  32  60  85
 [88]  17  44  83  72 135 118 149  48 136  64  38   1 144
> train <- iris[indexes, ]
> test <- iris[-indexes, ]
> library(tree)
> # NOTE: this model is fit on all 150 rows (data = iris), so the
> # misclassification rate reported below is measured on the rows used for fitting
> model <- tree(formula = Species ~ ., data = iris)
> summary(model)

Classification tree:
tree(formula = Species ~ ., data = iris)
Variables actually used in tree construction:
[1] "Petal.Length" "Petal.Width"  "Sepal.Length"
Number of terminal nodes:  6
Residual mean deviance:  0.1253 = 18.05 / 144
Misclassification error rate: 0.02667 = 4 / 150
> plot(model)
> text(model)
> # Use the split: fit on the training rows only, then score the held-out rows
> train_model <- tree(formula = Species ~ ., data = train)
> test_pred <- predict(object = train_model, newdata = test, type = "class")
> # Confusion matrix and misclassification rate on the 50 test rows
> table(predicted = test_pred, actual = test$Species)
> mean(test_pred != test$Species)
>