forked from shokru/mlfactor.github.io
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: interp.html
More file actions
925 lines (858 loc) · 110 KB
/
interp.html
File metadata and controls
925 lines (858 loc) · 110 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
<!DOCTYPE html>
<html lang="en" xml:lang="en">
<head>

<!-- Page metadata for the "Chapter 13 Interpretability" page; output generated
     by bookdown 0.21 with the GitBook 2.6.7 theme (see the generator meta tag). -->
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<title>Chapter 13 Interpretability | Machine Learning for Factor Investing</title>
<meta name="description" content="Chapter 13 Interpretability | Machine Learning for Factor Investing" />
<meta name="generator" content="bookdown 0.21 and GitBook 2.6.7" />
<meta property="og:title" content="Chapter 13 Interpretability | Machine Learning for Factor Investing" />
<meta property="og:type" content="book" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Chapter 13 Interpretability | Machine Learning for Factor Investing" />
<meta name="author" content="Guillaume Coqueret and Tony Guida" />
<meta name="date" content="2021-01-08" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black" />

<!-- Sequential chapter navigation: previous/next pages in the book. -->
<link rel="prev" href="backtest.html"/>
<link rel="next" href="causality.html"/>

<!-- GitBook theme assets: jQuery followed by the theme's plugin stylesheets. -->
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-clipboard.css" rel="stylesheet" />

<!-- Helper scripts/styles bundled with the generated output
     (accessible code blocks, section anchors, table rendering). -->
<script src="libs/accessible-code-block-0.0.1/empty-anchor.js"></script>
<link href="libs/anchor-sections-1.0/anchor-sections.css" rel="stylesheet" />
<script src="libs/anchor-sections-1.0/anchor-sections.js"></script>
<script src="libs/kePrint-0.0.1/kePrint.js"></script>
<link href="libs/lightable-0.0.1/lightable.css" rel="stylesheet" />

<!-- Inline syntax-highlighting CSS for code listings (.sourceCode): line
     numbering via CSS counters, print/screen media tweaks, and one color rule
     per token class (each class is explained by its trailing comment). -->
<style type="text/css">
code.sourceCode > span { display: inline-block; line-height: 1.25; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode { white-space: pre; position: relative; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
code.sourceCode { white-space: pre-wrap; }
code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html"><i class="fa fa-check"></i>Preface</a><ul>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#what-this-book-is-not-about"><i class="fa fa-check"></i>What this book is not about</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#the-targeted-audience"><i class="fa fa-check"></i>The targeted audience</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#how-this-book-is-structured"><i class="fa fa-check"></i>How this book is structured</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#companion-website"><i class="fa fa-check"></i>Companion website</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#why-r"><i class="fa fa-check"></i>Why R?</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#coding-instructions"><i class="fa fa-check"></i>Coding instructions</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#acknowledgments"><i class="fa fa-check"></i>Acknowledgments</a></li>
<li class="chapter" data-level="" data-path="preface.html"><a href="preface.html#future-developments"><i class="fa fa-check"></i>Future developments</a></li>
</ul></li>
<li class="part"><span><b>I Introduction</b></span></li>
<li class="chapter" data-level="1" data-path="notdata.html"><a href="notdata.html"><i class="fa fa-check"></i><b>1</b> Notations and data</a><ul>
<li class="chapter" data-level="1.1" data-path="notdata.html"><a href="notdata.html#notations"><i class="fa fa-check"></i><b>1.1</b> Notations</a></li>
<li class="chapter" data-level="1.2" data-path="notdata.html"><a href="notdata.html#dataset"><i class="fa fa-check"></i><b>1.2</b> Dataset</a></li>
</ul></li>
<li class="chapter" data-level="2" data-path="intro.html"><a href="intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
<li class="chapter" data-level="2.1" data-path="intro.html"><a href="intro.html#context"><i class="fa fa-check"></i><b>2.1</b> Context</a></li>
<li class="chapter" data-level="2.2" data-path="intro.html"><a href="intro.html#portfolio-construction-the-workflow"><i class="fa fa-check"></i><b>2.2</b> Portfolio construction: the workflow</a></li>
<li class="chapter" data-level="2.3" data-path="intro.html"><a href="intro.html#machine-learning-is-no-magic-wand"><i class="fa fa-check"></i><b>2.3</b> Machine learning is no magic wand</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="factor.html"><a href="factor.html"><i class="fa fa-check"></i><b>3</b> Factor investing and asset pricing anomalies</a><ul>
<li class="chapter" data-level="3.1" data-path="factor.html"><a href="factor.html#introduction"><i class="fa fa-check"></i><b>3.1</b> Introduction</a></li>
<li class="chapter" data-level="3.2" data-path="factor.html"><a href="factor.html#detecting-anomalies"><i class="fa fa-check"></i><b>3.2</b> Detecting anomalies</a><ul>
<li class="chapter" data-level="3.2.1" data-path="factor.html"><a href="factor.html#challenges"><i class="fa fa-check"></i><b>3.2.1</b> Challenges</a></li>
<li class="chapter" data-level="3.2.2" data-path="factor.html"><a href="factor.html#simple-portfolio-sorts"><i class="fa fa-check"></i><b>3.2.2</b> Simple portfolio sorts </a></li>
<li class="chapter" data-level="3.2.3" data-path="factor.html"><a href="factor.html#factors"><i class="fa fa-check"></i><b>3.2.3</b> Factors</a></li>
<li class="chapter" data-level="3.2.4" data-path="factor.html"><a href="factor.html#predictive-regressions-sorts-and-p-value-issues"><i class="fa fa-check"></i><b>3.2.4</b> Predictive regressions, sorts, and p-value issues</a></li>
<li class="chapter" data-level="3.2.5" data-path="factor.html"><a href="factor.html#fama-macbeth-regressions"><i class="fa fa-check"></i><b>3.2.5</b> Fama-Macbeth regressions</a></li>
<li class="chapter" data-level="3.2.6" data-path="factor.html"><a href="factor.html#factor-competition"><i class="fa fa-check"></i><b>3.2.6</b> Factor competition</a></li>
<li class="chapter" data-level="3.2.7" data-path="factor.html"><a href="factor.html#advanced-techniques"><i class="fa fa-check"></i><b>3.2.7</b> Advanced techniques</a></li>
</ul></li>
<li class="chapter" data-level="3.3" data-path="factor.html"><a href="factor.html#factors-or-characteristics"><i class="fa fa-check"></i><b>3.3</b> Factors or characteristics?</a></li>
<li class="chapter" data-level="3.4" data-path="factor.html"><a href="factor.html#hot-topics-momentum-timing-and-esg"><i class="fa fa-check"></i><b>3.4</b> Hot topics: momentum, timing and ESG</a><ul>
<li class="chapter" data-level="3.4.1" data-path="factor.html"><a href="factor.html#factor-momentum"><i class="fa fa-check"></i><b>3.4.1</b> Factor momentum</a></li>
<li class="chapter" data-level="3.4.2" data-path="factor.html"><a href="factor.html#factor-timing"><i class="fa fa-check"></i><b>3.4.2</b> Factor timing</a></li>
<li class="chapter" data-level="3.4.3" data-path="factor.html"><a href="factor.html#the-green-factors"><i class="fa fa-check"></i><b>3.4.3</b> The green factors</a></li>
</ul></li>
<li class="chapter" data-level="3.5" data-path="factor.html"><a href="factor.html#the-links-with-machine-learning"><i class="fa fa-check"></i><b>3.5</b> The links with machine learning</a><ul>
<li class="chapter" data-level="3.5.1" data-path="factor.html"><a href="factor.html#a-short-list-of-recent-references"><i class="fa fa-check"></i><b>3.5.1</b> A short list of recent references</a></li>
<li class="chapter" data-level="3.5.2" data-path="factor.html"><a href="factor.html#explicit-connections-with-asset-pricing-models"><i class="fa fa-check"></i><b>3.5.2</b> Explicit connections with asset pricing models</a></li>
</ul></li>
<li class="chapter" data-level="3.6" data-path="factor.html"><a href="factor.html#coding-exercises"><i class="fa fa-check"></i><b>3.6</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="4" data-path="Data.html"><a href="Data.html"><i class="fa fa-check"></i><b>4</b> Data preprocessing</a><ul>
<li class="chapter" data-level="4.1" data-path="Data.html"><a href="Data.html#know-your-data"><i class="fa fa-check"></i><b>4.1</b> Know your data</a></li>
<li class="chapter" data-level="4.2" data-path="Data.html"><a href="Data.html#missing-data"><i class="fa fa-check"></i><b>4.2</b> Missing data</a></li>
<li class="chapter" data-level="4.3" data-path="Data.html"><a href="Data.html#outlier-detection"><i class="fa fa-check"></i><b>4.3</b> Outlier detection</a></li>
<li class="chapter" data-level="4.4" data-path="Data.html"><a href="Data.html#feateng"><i class="fa fa-check"></i><b>4.4</b> Feature engineering</a><ul>
<li class="chapter" data-level="4.4.1" data-path="Data.html"><a href="Data.html#feature-selection"><i class="fa fa-check"></i><b>4.4.1</b> Feature selection</a></li>
<li class="chapter" data-level="4.4.2" data-path="Data.html"><a href="Data.html#scaling"><i class="fa fa-check"></i><b>4.4.2</b> Scaling the predictors</a></li>
</ul></li>
<li class="chapter" data-level="4.5" data-path="Data.html"><a href="Data.html#labelling"><i class="fa fa-check"></i><b>4.5</b> Labelling</a><ul>
<li class="chapter" data-level="4.5.1" data-path="Data.html"><a href="Data.html#simple-labels"><i class="fa fa-check"></i><b>4.5.1</b> Simple labels</a></li>
<li class="chapter" data-level="4.5.2" data-path="Data.html"><a href="Data.html#categorical-labels"><i class="fa fa-check"></i><b>4.5.2</b> Categorical labels</a></li>
<li class="chapter" data-level="4.5.3" data-path="Data.html"><a href="Data.html#the-triple-barrier-method"><i class="fa fa-check"></i><b>4.5.3</b> The triple barrier method</a></li>
<li class="chapter" data-level="4.5.4" data-path="Data.html"><a href="Data.html#filtering-the-sample"><i class="fa fa-check"></i><b>4.5.4</b> Filtering the sample</a></li>
<li class="chapter" data-level="4.5.5" data-path="Data.html"><a href="Data.html#horizons"><i class="fa fa-check"></i><b>4.5.5</b> Return horizons</a></li>
</ul></li>
<li class="chapter" data-level="4.6" data-path="Data.html"><a href="Data.html#pers"><i class="fa fa-check"></i><b>4.6</b> Handling persistence</a></li>
<li class="chapter" data-level="4.7" data-path="Data.html"><a href="Data.html#extensions"><i class="fa fa-check"></i><b>4.7</b> Extensions</a><ul>
<li class="chapter" data-level="4.7.1" data-path="Data.html"><a href="Data.html#transforming-features"><i class="fa fa-check"></i><b>4.7.1</b> Transforming features</a></li>
<li class="chapter" data-level="4.7.2" data-path="Data.html"><a href="Data.html#macrovar"><i class="fa fa-check"></i><b>4.7.2</b> Macro-economic variables</a></li>
<li class="chapter" data-level="4.7.3" data-path="Data.html"><a href="Data.html#active-learning"><i class="fa fa-check"></i><b>4.7.3</b> Active learning</a></li>
</ul></li>
<li class="chapter" data-level="4.8" data-path="Data.html"><a href="Data.html#additional-code-and-results"><i class="fa fa-check"></i><b>4.8</b> Additional code and results</a><ul>
<li class="chapter" data-level="4.8.1" data-path="Data.html"><a href="Data.html#impact-of-rescaling-graphical-representation"><i class="fa fa-check"></i><b>4.8.1</b> Impact of rescaling: graphical representation</a></li>
<li class="chapter" data-level="4.8.2" data-path="Data.html"><a href="Data.html#impact-of-rescaling-toy-example"><i class="fa fa-check"></i><b>4.8.2</b> Impact of rescaling: toy example</a></li>
</ul></li>
<li class="chapter" data-level="4.9" data-path="Data.html"><a href="Data.html#coding-exercises-1"><i class="fa fa-check"></i><b>4.9</b> Coding exercises</a></li>
</ul></li>
<li class="part"><span><b>II Common supervised algorithms</b></span></li>
<li class="chapter" data-level="5" data-path="lasso.html"><a href="lasso.html"><i class="fa fa-check"></i><b>5</b> Penalized regressions and sparse hedging for minimum variance portfolios</a><ul>
<li class="chapter" data-level="5.1" data-path="lasso.html"><a href="lasso.html#penalized-regressions"><i class="fa fa-check"></i><b>5.1</b> Penalized regressions</a><ul>
<li class="chapter" data-level="5.1.1" data-path="lasso.html"><a href="lasso.html#penreg"><i class="fa fa-check"></i><b>5.1.1</b> Simple regressions</a></li>
<li class="chapter" data-level="5.1.2" data-path="lasso.html"><a href="lasso.html#forms-of-penalizations"><i class="fa fa-check"></i><b>5.1.2</b> Forms of penalizations</a></li>
<li class="chapter" data-level="5.1.3" data-path="lasso.html"><a href="lasso.html#illustrations"><i class="fa fa-check"></i><b>5.1.3</b> Illustrations</a></li>
</ul></li>
<li class="chapter" data-level="5.2" data-path="lasso.html"><a href="lasso.html#sparse-hedging-for-minimum-variance-portfolios"><i class="fa fa-check"></i><b>5.2</b> Sparse hedging for minimum variance portfolios</a><ul>
<li class="chapter" data-level="5.2.1" data-path="lasso.html"><a href="lasso.html#presentation-and-derivations"><i class="fa fa-check"></i><b>5.2.1</b> Presentation and derivations</a></li>
<li class="chapter" data-level="5.2.2" data-path="lasso.html"><a href="lasso.html#sparseex"><i class="fa fa-check"></i><b>5.2.2</b> Example</a></li>
</ul></li>
<li class="chapter" data-level="5.3" data-path="lasso.html"><a href="lasso.html#predictive-regressions"><i class="fa fa-check"></i><b>5.3</b> Predictive regressions</a><ul>
<li class="chapter" data-level="5.3.1" data-path="lasso.html"><a href="lasso.html#literature-review-and-principle"><i class="fa fa-check"></i><b>5.3.1</b> Literature review and principle</a></li>
<li class="chapter" data-level="5.3.2" data-path="lasso.html"><a href="lasso.html#code-and-results"><i class="fa fa-check"></i><b>5.3.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="5.4" data-path="lasso.html"><a href="lasso.html#coding-exercise"><i class="fa fa-check"></i><b>5.4</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="6" data-path="trees.html"><a href="trees.html"><i class="fa fa-check"></i><b>6</b> Tree-based methods</a><ul>
<li class="chapter" data-level="6.1" data-path="trees.html"><a href="trees.html#simple-trees"><i class="fa fa-check"></i><b>6.1</b> Simple trees</a><ul>
<li class="chapter" data-level="6.1.1" data-path="trees.html"><a href="trees.html#principle"><i class="fa fa-check"></i><b>6.1.1</b> Principle</a></li>
<li class="chapter" data-level="6.1.2" data-path="trees.html"><a href="trees.html#treeclass"><i class="fa fa-check"></i><b>6.1.2</b> Further details on classification</a></li>
<li class="chapter" data-level="6.1.3" data-path="trees.html"><a href="trees.html#pruning-criteria"><i class="fa fa-check"></i><b>6.1.3</b> Pruning criteria</a></li>
<li class="chapter" data-level="6.1.4" data-path="trees.html"><a href="trees.html#code-and-interpretation"><i class="fa fa-check"></i><b>6.1.4</b> Code and interpretation</a></li>
</ul></li>
<li class="chapter" data-level="6.2" data-path="trees.html"><a href="trees.html#random-forests"><i class="fa fa-check"></i><b>6.2</b> Random forests</a><ul>
<li class="chapter" data-level="6.2.1" data-path="trees.html"><a href="trees.html#principle-1"><i class="fa fa-check"></i><b>6.2.1</b> Principle</a></li>
<li class="chapter" data-level="6.2.2" data-path="trees.html"><a href="trees.html#code-and-results-1"><i class="fa fa-check"></i><b>6.2.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="6.3" data-path="trees.html"><a href="trees.html#adaboost"><i class="fa fa-check"></i><b>6.3</b> Boosted trees: Adaboost</a><ul>
<li class="chapter" data-level="6.3.1" data-path="trees.html"><a href="trees.html#methodology"><i class="fa fa-check"></i><b>6.3.1</b> Methodology</a></li>
<li class="chapter" data-level="6.3.2" data-path="trees.html"><a href="trees.html#illustration"><i class="fa fa-check"></i><b>6.3.2</b> Illustration</a></li>
</ul></li>
<li class="chapter" data-level="6.4" data-path="trees.html"><a href="trees.html#boosted-trees-extreme-gradient-boosting"><i class="fa fa-check"></i><b>6.4</b> Boosted trees: extreme gradient boosting</a><ul>
<li class="chapter" data-level="6.4.1" data-path="trees.html"><a href="trees.html#managing-loss"><i class="fa fa-check"></i><b>6.4.1</b> Managing loss</a></li>
<li class="chapter" data-level="6.4.2" data-path="trees.html"><a href="trees.html#penalization"><i class="fa fa-check"></i><b>6.4.2</b> Penalization</a></li>
<li class="chapter" data-level="6.4.3" data-path="trees.html"><a href="trees.html#aggregation"><i class="fa fa-check"></i><b>6.4.3</b> Aggregation</a></li>
<li class="chapter" data-level="6.4.4" data-path="trees.html"><a href="trees.html#tree-structure"><i class="fa fa-check"></i><b>6.4.4</b> Tree structure</a></li>
<li class="chapter" data-level="6.4.5" data-path="trees.html"><a href="trees.html#boostext"><i class="fa fa-check"></i><b>6.4.5</b> Extensions</a></li>
<li class="chapter" data-level="6.4.6" data-path="trees.html"><a href="trees.html#boostcode"><i class="fa fa-check"></i><b>6.4.6</b> Code and results</a></li>
<li class="chapter" data-level="6.4.7" data-path="trees.html"><a href="trees.html#instweight"><i class="fa fa-check"></i><b>6.4.7</b> Instance weighting</a></li>
</ul></li>
<li class="chapter" data-level="6.5" data-path="trees.html"><a href="trees.html#discussion"><i class="fa fa-check"></i><b>6.5</b> Discussion</a></li>
<li class="chapter" data-level="6.6" data-path="trees.html"><a href="trees.html#coding-exercises-2"><i class="fa fa-check"></i><b>6.6</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="7" data-path="NN.html"><a href="NN.html"><i class="fa fa-check"></i><b>7</b> Neural networks</a><ul>
<li class="chapter" data-level="7.1" data-path="NN.html"><a href="NN.html#the-original-perceptron"><i class="fa fa-check"></i><b>7.1</b> The original perceptron</a></li>
<li class="chapter" data-level="7.2" data-path="NN.html"><a href="NN.html#multilayer-perceptron"><i class="fa fa-check"></i><b>7.2</b> Multilayer perceptron</a><ul>
<li class="chapter" data-level="7.2.1" data-path="NN.html"><a href="NN.html#introduction-and-notations"><i class="fa fa-check"></i><b>7.2.1</b> Introduction and notations</a></li>
<li class="chapter" data-level="7.2.2" data-path="NN.html"><a href="NN.html#universal-approximation"><i class="fa fa-check"></i><b>7.2.2</b> Universal approximation</a></li>
<li class="chapter" data-level="7.2.3" data-path="NN.html"><a href="NN.html#backprop"><i class="fa fa-check"></i><b>7.2.3</b> Learning via back-propagation</a></li>
<li class="chapter" data-level="7.2.4" data-path="NN.html"><a href="NN.html#further-details-on-classification"><i class="fa fa-check"></i><b>7.2.4</b> Further details on classification</a></li>
</ul></li>
<li class="chapter" data-level="7.3" data-path="NN.html"><a href="NN.html#howdeep"><i class="fa fa-check"></i><b>7.3</b> How deep we should go and other practical issues</a><ul>
<li class="chapter" data-level="7.3.1" data-path="NN.html"><a href="NN.html#architectural-choices"><i class="fa fa-check"></i><b>7.3.1</b> Architectural choices</a></li>
<li class="chapter" data-level="7.3.2" data-path="NN.html"><a href="NN.html#frequency-of-weight-updates-and-learning-duration"><i class="fa fa-check"></i><b>7.3.2</b> Frequency of weight updates and learning duration</a></li>
<li class="chapter" data-level="7.3.3" data-path="NN.html"><a href="NN.html#penalizations-and-dropout"><i class="fa fa-check"></i><b>7.3.3</b> Penalizations and dropout</a></li>
</ul></li>
<li class="chapter" data-level="7.4" data-path="NN.html"><a href="NN.html#code-samples-and-comments-for-vanilla-mlp"><i class="fa fa-check"></i><b>7.4</b> Code samples and comments for vanilla MLP</a><ul>
<li class="chapter" data-level="7.4.1" data-path="NN.html"><a href="NN.html#regression-example"><i class="fa fa-check"></i><b>7.4.1</b> Regression example</a></li>
<li class="chapter" data-level="7.4.2" data-path="NN.html"><a href="NN.html#classification-example"><i class="fa fa-check"></i><b>7.4.2</b> Classification example</a></li>
<li class="chapter" data-level="7.4.3" data-path="NN.html"><a href="NN.html#custloss"><i class="fa fa-check"></i><b>7.4.3</b> Custom losses</a></li>
</ul></li>
<li class="chapter" data-level="7.5" data-path="NN.html"><a href="NN.html#recurrent-networks"><i class="fa fa-check"></i><b>7.5</b> Recurrent networks</a><ul>
<li class="chapter" data-level="7.5.1" data-path="NN.html"><a href="NN.html#presentation"><i class="fa fa-check"></i><b>7.5.1</b> Presentation</a></li>
<li class="chapter" data-level="7.5.2" data-path="NN.html"><a href="NN.html#code-and-results-2"><i class="fa fa-check"></i><b>7.5.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="7.6" data-path="NN.html"><a href="NN.html#other-common-architectures"><i class="fa fa-check"></i><b>7.6</b> Other common architectures</a><ul>
<li class="chapter" data-level="7.6.1" data-path="NN.html"><a href="NN.html#generative-aversarial-networks"><i class="fa fa-check"></i><b>7.6.1</b> Generative adversarial networks</a></li>
<li class="chapter" data-level="7.6.2" data-path="NN.html"><a href="NN.html#autoencoders"><i class="fa fa-check"></i><b>7.6.2</b> Autoencoders</a></li>
<li class="chapter" data-level="7.6.3" data-path="NN.html"><a href="NN.html#a-word-on-convolutional-networks"><i class="fa fa-check"></i><b>7.6.3</b> A word on convolutional networks</a></li>
<li class="chapter" data-level="7.6.4" data-path="NN.html"><a href="NN.html#advanced-architectures"><i class="fa fa-check"></i><b>7.6.4</b> Advanced architectures</a></li>
</ul></li>
<li class="chapter" data-level="7.7" data-path="NN.html"><a href="NN.html#coding-exercise-1"><i class="fa fa-check"></i><b>7.7</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="8" data-path="svm.html"><a href="svm.html"><i class="fa fa-check"></i><b>8</b> Support vector machines</a><ul>
<li class="chapter" data-level="8.1" data-path="svm.html"><a href="svm.html#svm-for-classification"><i class="fa fa-check"></i><b>8.1</b> SVM for classification</a></li>
<li class="chapter" data-level="8.2" data-path="svm.html"><a href="svm.html#svm-for-regression"><i class="fa fa-check"></i><b>8.2</b> SVM for regression</a></li>
<li class="chapter" data-level="8.3" data-path="svm.html"><a href="svm.html#practice"><i class="fa fa-check"></i><b>8.3</b> Practice</a></li>
<li class="chapter" data-level="8.4" data-path="svm.html"><a href="svm.html#coding-exercises-3"><i class="fa fa-check"></i><b>8.4</b> Coding exercises</a></li>
</ul></li>
<li class="chapter" data-level="9" data-path="bayes.html"><a href="bayes.html"><i class="fa fa-check"></i><b>9</b> Bayesian methods</a><ul>
<li class="chapter" data-level="9.1" data-path="bayes.html"><a href="bayes.html#the-bayesian-framework"><i class="fa fa-check"></i><b>9.1</b> The Bayesian framework</a></li>
<li class="chapter" data-level="9.2" data-path="bayes.html"><a href="bayes.html#bayesian-sampling"><i class="fa fa-check"></i><b>9.2</b> Bayesian sampling</a><ul>
<li class="chapter" data-level="9.2.1" data-path="bayes.html"><a href="bayes.html#gibbs-sampling"><i class="fa fa-check"></i><b>9.2.1</b> Gibbs sampling</a></li>
<li class="chapter" data-level="9.2.2" data-path="bayes.html"><a href="bayes.html#metropolis-hastings-sampling"><i class="fa fa-check"></i><b>9.2.2</b> Metropolis-Hastings sampling</a></li>
</ul></li>
<li class="chapter" data-level="9.3" data-path="bayes.html"><a href="bayes.html#bayesian-linear-regression"><i class="fa fa-check"></i><b>9.3</b> Bayesian linear regression</a></li>
<li class="chapter" data-level="9.4" data-path="bayes.html"><a href="bayes.html#naive-bayes-classifier"><i class="fa fa-check"></i><b>9.4</b> Naive Bayes classifier</a></li>
<li class="chapter" data-level="9.5" data-path="bayes.html"><a href="bayes.html#BART"><i class="fa fa-check"></i><b>9.5</b> Bayesian additive trees</a><ul>
<li class="chapter" data-level="9.5.1" data-path="bayes.html"><a href="bayes.html#general-formulation"><i class="fa fa-check"></i><b>9.5.1</b> General formulation</a></li>
<li class="chapter" data-level="9.5.2" data-path="bayes.html"><a href="bayes.html#priors"><i class="fa fa-check"></i><b>9.5.2</b> Priors</a></li>
<li class="chapter" data-level="9.5.3" data-path="bayes.html"><a href="bayes.html#sampling-and-predictions"><i class="fa fa-check"></i><b>9.5.3</b> Sampling and predictions</a></li>
<li class="chapter" data-level="9.5.4" data-path="bayes.html"><a href="bayes.html#code"><i class="fa fa-check"></i><b>9.5.4</b> Code</a></li>
</ul></li>
</ul></li>
<li class="part"><span><b>III From predictions to portfolios</b></span></li>
<li class="chapter" data-level="10" data-path="valtune.html"><a href="valtune.html"><i class="fa fa-check"></i><b>10</b> Validating and tuning</a><ul>
<li class="chapter" data-level="10.1" data-path="valtune.html"><a href="valtune.html#mlmetrics"><i class="fa fa-check"></i><b>10.1</b> Learning metrics</a><ul>
<li class="chapter" data-level="10.1.1" data-path="valtune.html"><a href="valtune.html#regression-analysis"><i class="fa fa-check"></i><b>10.1.1</b> Regression analysis</a></li>
<li class="chapter" data-level="10.1.2" data-path="valtune.html"><a href="valtune.html#classification-analysis"><i class="fa fa-check"></i><b>10.1.2</b> Classification analysis</a></li>
</ul></li>
<li class="chapter" data-level="10.2" data-path="valtune.html"><a href="valtune.html#validation"><i class="fa fa-check"></i><b>10.2</b> Validation</a><ul>
<li class="chapter" data-level="10.2.1" data-path="valtune.html"><a href="valtune.html#the-variance-bias-tradeoff-theory"><i class="fa fa-check"></i><b>10.2.1</b> The variance-bias tradeoff: theory</a></li>
<li class="chapter" data-level="10.2.2" data-path="valtune.html"><a href="valtune.html#the-variance-bias-tradeoff-illustration"><i class="fa fa-check"></i><b>10.2.2</b> The variance-bias tradeoff: illustration</a></li>
<li class="chapter" data-level="10.2.3" data-path="valtune.html"><a href="valtune.html#the-risk-of-overfitting-principle"><i class="fa fa-check"></i><b>10.2.3</b> The risk of overfitting: principle</a></li>
<li class="chapter" data-level="10.2.4" data-path="valtune.html"><a href="valtune.html#the-risk-of-overfitting-some-solutions"><i class="fa fa-check"></i><b>10.2.4</b> The risk of overfitting: some solutions</a></li>
</ul></li>
<li class="chapter" data-level="10.3" data-path="valtune.html"><a href="valtune.html#the-search-for-good-hyperparameters"><i class="fa fa-check"></i><b>10.3</b> The search for good hyperparameters</a><ul>
<li class="chapter" data-level="10.3.1" data-path="valtune.html"><a href="valtune.html#methods"><i class="fa fa-check"></i><b>10.3.1</b> Methods</a></li>
<li class="chapter" data-level="10.3.2" data-path="valtune.html"><a href="valtune.html#example-grid-search"><i class="fa fa-check"></i><b>10.3.2</b> Example: grid search</a></li>
<li class="chapter" data-level="10.3.3" data-path="valtune.html"><a href="valtune.html#example-bayesian-optimization"><i class="fa fa-check"></i><b>10.3.3</b> Example: Bayesian optimization</a></li>
</ul></li>
<li class="chapter" data-level="10.4" data-path="valtune.html"><a href="valtune.html#short-discussion-on-validation-in-backtests"><i class="fa fa-check"></i><b>10.4</b> Short discussion on validation in backtests</a></li>
</ul></li>
<li class="chapter" data-level="11" data-path="ensemble.html"><a href="ensemble.html"><i class="fa fa-check"></i><b>11</b> Ensemble models</a><ul>
<li class="chapter" data-level="11.1" data-path="ensemble.html"><a href="ensemble.html#linear-ensembles"><i class="fa fa-check"></i><b>11.1</b> Linear ensembles</a><ul>
<li class="chapter" data-level="11.1.1" data-path="ensemble.html"><a href="ensemble.html#principles"><i class="fa fa-check"></i><b>11.1.1</b> Principles</a></li>
<li class="chapter" data-level="11.1.2" data-path="ensemble.html"><a href="ensemble.html#example"><i class="fa fa-check"></i><b>11.1.2</b> Example</a></li>
</ul></li>
<li class="chapter" data-level="11.2" data-path="ensemble.html"><a href="ensemble.html#stacked-ensembles"><i class="fa fa-check"></i><b>11.2</b> Stacked ensembles</a><ul>
<li class="chapter" data-level="11.2.1" data-path="ensemble.html"><a href="ensemble.html#two-stage-training"><i class="fa fa-check"></i><b>11.2.1</b> Two-stage training</a></li>
<li class="chapter" data-level="11.2.2" data-path="ensemble.html"><a href="ensemble.html#code-and-results-3"><i class="fa fa-check"></i><b>11.2.2</b> Code and results</a></li>
</ul></li>
<li class="chapter" data-level="11.3" data-path="ensemble.html"><a href="ensemble.html#extensions-1"><i class="fa fa-check"></i><b>11.3</b> Extensions</a><ul>
<li class="chapter" data-level="11.3.1" data-path="ensemble.html"><a href="ensemble.html#exogenous-variables"><i class="fa fa-check"></i><b>11.3.1</b> Exogenous variables</a></li>
<li class="chapter" data-level="11.3.2" data-path="ensemble.html"><a href="ensemble.html#shrinking-inter-model-correlations"><i class="fa fa-check"></i><b>11.3.2</b> Shrinking inter-model correlations</a></li>
</ul></li>
<li class="chapter" data-level="11.4" data-path="ensemble.html"><a href="ensemble.html#exercise"><i class="fa fa-check"></i><b>11.4</b> Exercise</a></li>
</ul></li>
<li class="chapter" data-level="12" data-path="backtest.html"><a href="backtest.html"><i class="fa fa-check"></i><b>12</b> Portfolio backtesting</a><ul>
<li class="chapter" data-level="12.1" data-path="backtest.html"><a href="backtest.html#protocol"><i class="fa fa-check"></i><b>12.1</b> Setting the protocol</a></li>
<li class="chapter" data-level="12.2" data-path="backtest.html"><a href="backtest.html#turning-signals-into-portfolio-weights"><i class="fa fa-check"></i><b>12.2</b> Turning signals into portfolio weights</a></li>
<li class="chapter" data-level="12.3" data-path="backtest.html"><a href="backtest.html#perfmet"><i class="fa fa-check"></i><b>12.3</b> Performance metrics</a><ul>
<li class="chapter" data-level="12.3.1" data-path="backtest.html"><a href="backtest.html#discussion-1"><i class="fa fa-check"></i><b>12.3.1</b> Discussion</a></li>
<li class="chapter" data-level="12.3.2" data-path="backtest.html"><a href="backtest.html#pure-performance-and-risk-indicators"><i class="fa fa-check"></i><b>12.3.2</b> Pure performance and risk indicators</a></li>
<li class="chapter" data-level="12.3.3" data-path="backtest.html"><a href="backtest.html#factor-based-evaluation"><i class="fa fa-check"></i><b>12.3.3</b> Factor-based evaluation</a></li>
<li class="chapter" data-level="12.3.4" data-path="backtest.html"><a href="backtest.html#risk-adjusted-measures"><i class="fa fa-check"></i><b>12.3.4</b> Risk-adjusted measures</a></li>
<li class="chapter" data-level="12.3.5" data-path="backtest.html"><a href="backtest.html#transaction-costs-and-turnover"><i class="fa fa-check"></i><b>12.3.5</b> Transaction costs and turnover</a></li>
</ul></li>
<li class="chapter" data-level="12.4" data-path="backtest.html"><a href="backtest.html#common-errors-and-issues"><i class="fa fa-check"></i><b>12.4</b> Common errors and issues</a><ul>
<li class="chapter" data-level="12.4.1" data-path="backtest.html"><a href="backtest.html#forward-looking-data"><i class="fa fa-check"></i><b>12.4.1</b> Forward looking data</a></li>
<li class="chapter" data-level="12.4.2" data-path="backtest.html"><a href="backtest.html#backov"><i class="fa fa-check"></i><b>12.4.2</b> Backtest overfitting</a></li>
<li class="chapter" data-level="12.4.3" data-path="backtest.html"><a href="backtest.html#simple-safeguards"><i class="fa fa-check"></i><b>12.4.3</b> Simple safeguards</a></li>
</ul></li>
<li class="chapter" data-level="12.5" data-path="backtest.html"><a href="backtest.html#implication-of-non-stationarity-forecasting-is-hard"><i class="fa fa-check"></i><b>12.5</b> Implication of non-stationarity: forecasting is hard</a><ul>
<li class="chapter" data-level="12.5.1" data-path="backtest.html"><a href="backtest.html#general-comments"><i class="fa fa-check"></i><b>12.5.1</b> General comments</a></li>
<li class="chapter" data-level="12.5.2" data-path="backtest.html"><a href="backtest.html#the-no-free-lunch-theorem"><i class="fa fa-check"></i><b>12.5.2</b> The no free lunch theorem</a></li>
</ul></li>
<li class="chapter" data-level="12.6" data-path="backtest.html"><a href="backtest.html#first-example-a-complete-backtest"><i class="fa fa-check"></i><b>12.6</b> First example: a complete backtest</a></li>
<li class="chapter" data-level="12.7" data-path="backtest.html"><a href="backtest.html#second-example-backtest-overfitting"><i class="fa fa-check"></i><b>12.7</b> Second example: backtest overfitting</a></li>
<li class="chapter" data-level="12.8" data-path="backtest.html"><a href="backtest.html#coding-exercises-4"><i class="fa fa-check"></i><b>12.8</b> Coding exercises</a></li>
</ul></li>
<li class="part"><span><b>IV Further important topics</b></span></li>
<li class="chapter" data-level="13" data-path="interp.html"><a href="interp.html"><i class="fa fa-check"></i><b>13</b> Interpretability</a><ul>
<li class="chapter" data-level="13.1" data-path="interp.html"><a href="interp.html#global-interpretations"><i class="fa fa-check"></i><b>13.1</b> Global interpretations</a><ul>
<li class="chapter" data-level="13.1.1" data-path="interp.html"><a href="interp.html#surr"><i class="fa fa-check"></i><b>13.1.1</b> Simple models as surrogates</a></li>
<li class="chapter" data-level="13.1.2" data-path="interp.html"><a href="interp.html#variable-importance"><i class="fa fa-check"></i><b>13.1.2</b> Variable importance (tree-based)</a></li>
<li class="chapter" data-level="13.1.3" data-path="interp.html"><a href="interp.html#variable-importance-agnostic"><i class="fa fa-check"></i><b>13.1.3</b> Variable importance (agnostic)</a></li>
<li class="chapter" data-level="13.1.4" data-path="interp.html"><a href="interp.html#partial-dependence-plot"><i class="fa fa-check"></i><b>13.1.4</b> Partial dependence plot</a></li>
</ul></li>
<li class="chapter" data-level="13.2" data-path="interp.html"><a href="interp.html#local-interpretations"><i class="fa fa-check"></i><b>13.2</b> Local interpretations</a><ul>
<li class="chapter" data-level="13.2.1" data-path="interp.html"><a href="interp.html#lime"><i class="fa fa-check"></i><b>13.2.1</b> LIME</a></li>
<li class="chapter" data-level="13.2.2" data-path="interp.html"><a href="interp.html#shapley-values"><i class="fa fa-check"></i><b>13.2.2</b> Shapley values</a></li>
<li class="chapter" data-level="13.2.3" data-path="interp.html"><a href="interp.html#breakdown"><i class="fa fa-check"></i><b>13.2.3</b> Breakdown</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="14" data-path="causality.html"><a href="causality.html"><i class="fa fa-check"></i><b>14</b> Two key concepts: causality and non-stationarity</a><ul>
<li class="chapter" data-level="14.1" data-path="causality.html"><a href="causality.html#causality-1"><i class="fa fa-check"></i><b>14.1</b> Causality</a><ul>
<li class="chapter" data-level="14.1.1" data-path="causality.html"><a href="causality.html#granger"><i class="fa fa-check"></i><b>14.1.1</b> Granger causality</a></li>
<li class="chapter" data-level="14.1.2" data-path="causality.html"><a href="causality.html#causal-additive-models"><i class="fa fa-check"></i><b>14.1.2</b> Causal additive models</a></li>
<li class="chapter" data-level="14.1.3" data-path="causality.html"><a href="causality.html#structural-time-series-models"><i class="fa fa-check"></i><b>14.1.3</b> Structural time series models</a></li>
</ul></li>
<li class="chapter" data-level="14.2" data-path="causality.html"><a href="causality.html#nonstat"><i class="fa fa-check"></i><b>14.2</b> Dealing with changing environments</a><ul>
<li class="chapter" data-level="14.2.1" data-path="causality.html"><a href="causality.html#non-stationarity-yet-another-illustration"><i class="fa fa-check"></i><b>14.2.1</b> Non-stationarity: yet another illustration</a></li>
<li class="chapter" data-level="14.2.2" data-path="causality.html"><a href="causality.html#online-learning"><i class="fa fa-check"></i><b>14.2.2</b> Online learning</a></li>
<li class="chapter" data-level="14.2.3" data-path="causality.html"><a href="causality.html#homogeneous-transfer-learning"><i class="fa fa-check"></i><b>14.2.3</b> Homogeneous transfer learning</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="15" data-path="unsup.html"><a href="unsup.html"><i class="fa fa-check"></i><b>15</b> Unsupervised learning</a><ul>
<li class="chapter" data-level="15.1" data-path="unsup.html"><a href="unsup.html#corpred"><i class="fa fa-check"></i><b>15.1</b> The problem with correlated predictors</a></li>
<li class="chapter" data-level="15.2" data-path="unsup.html"><a href="unsup.html#principal-component-analysis-and-autoencoders"><i class="fa fa-check"></i><b>15.2</b> Principal component analysis and autoencoders</a><ul>
<li class="chapter" data-level="15.2.1" data-path="unsup.html"><a href="unsup.html#a-bit-of-algebra"><i class="fa fa-check"></i><b>15.2.1</b> A bit of algebra</a></li>
<li class="chapter" data-level="15.2.2" data-path="unsup.html"><a href="unsup.html#pca"><i class="fa fa-check"></i><b>15.2.2</b> PCA</a></li>
<li class="chapter" data-level="15.2.3" data-path="unsup.html"><a href="unsup.html#ae"><i class="fa fa-check"></i><b>15.2.3</b> Autoencoders</a></li>
<li class="chapter" data-level="15.2.4" data-path="unsup.html"><a href="unsup.html#application"><i class="fa fa-check"></i><b>15.2.4</b> Application</a></li>
</ul></li>
<li class="chapter" data-level="15.3" data-path="unsup.html"><a href="unsup.html#clustering-via-k-means"><i class="fa fa-check"></i><b>15.3</b> Clustering via k-means</a></li>
<li class="chapter" data-level="15.4" data-path="unsup.html"><a href="unsup.html#nearest-neighbors"><i class="fa fa-check"></i><b>15.4</b> Nearest neighbors</a></li>
<li class="chapter" data-level="15.5" data-path="unsup.html"><a href="unsup.html#coding-exercise-2"><i class="fa fa-check"></i><b>15.5</b> Coding exercise</a></li>
</ul></li>
<li class="chapter" data-level="16" data-path="RL.html"><a href="RL.html"><i class="fa fa-check"></i><b>16</b> Reinforcement learning</a><ul>
<li class="chapter" data-level="16.1" data-path="RL.html"><a href="RL.html#theoretical-layout"><i class="fa fa-check"></i><b>16.1</b> Theoretical layout</a><ul>
<li class="chapter" data-level="16.1.1" data-path="RL.html"><a href="RL.html#general-framework"><i class="fa fa-check"></i><b>16.1.1</b> General framework</a></li>
<li class="chapter" data-level="16.1.2" data-path="RL.html"><a href="RL.html#q-learning"><i class="fa fa-check"></i><b>16.1.2</b> Q-learning</a></li>
<li class="chapter" data-level="16.1.3" data-path="RL.html"><a href="RL.html#sarsa"><i class="fa fa-check"></i><b>16.1.3</b> SARSA</a></li>
</ul></li>
<li class="chapter" data-level="16.2" data-path="RL.html"><a href="RL.html#the-curse-of-dimensionality"><i class="fa fa-check"></i><b>16.2</b> The curse of dimensionality</a></li>
<li class="chapter" data-level="16.3" data-path="RL.html"><a href="RL.html#policy-gradient"><i class="fa fa-check"></i><b>16.3</b> Policy gradient</a><ul>
<li class="chapter" data-level="16.3.1" data-path="RL.html"><a href="RL.html#principle-2"><i class="fa fa-check"></i><b>16.3.1</b> Principle</a></li>
<li class="chapter" data-level="16.3.2" data-path="RL.html"><a href="RL.html#extensions-2"><i class="fa fa-check"></i><b>16.3.2</b> Extensions</a></li>
</ul></li>
<li class="chapter" data-level="16.4" data-path="RL.html"><a href="RL.html#simple-examples"><i class="fa fa-check"></i><b>16.4</b> Simple examples</a><ul>
<li class="chapter" data-level="16.4.1" data-path="RL.html"><a href="RL.html#q-learning-with-simulations"><i class="fa fa-check"></i><b>16.4.1</b> Q-learning with simulations</a></li>
<li class="chapter" data-level="16.4.2" data-path="RL.html"><a href="RL.html#RLemp2"><i class="fa fa-check"></i><b>16.4.2</b> Q-learning with market data</a></li>
</ul></li>
<li class="chapter" data-level="16.5" data-path="RL.html"><a href="RL.html#concluding-remarks"><i class="fa fa-check"></i><b>16.5</b> Concluding remarks</a></li>
<li class="chapter" data-level="16.6" data-path="RL.html"><a href="RL.html#exercises"><i class="fa fa-check"></i><b>16.6</b> Exercises</a></li>
</ul></li>
<li class="part"><span><b>V Appendix</b></span></li>
<li class="chapter" data-level="17" data-path="data-description.html"><a href="data-description.html"><i class="fa fa-check"></i><b>17</b> Data description</a></li>
<li class="chapter" data-level="18" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html"><i class="fa fa-check"></i><b>18</b> Solutions to exercises</a><ul>
<li class="chapter" data-level="18.1" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-3"><i class="fa fa-check"></i><b>18.1</b> Chapter 3</a></li>
<li class="chapter" data-level="18.2" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-4"><i class="fa fa-check"></i><b>18.2</b> Chapter 4</a></li>
<li class="chapter" data-level="18.3" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-5"><i class="fa fa-check"></i><b>18.3</b> Chapter 5</a></li>
<li class="chapter" data-level="18.4" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-6"><i class="fa fa-check"></i><b>18.4</b> Chapter 6</a></li>
<li class="chapter" data-level="18.5" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-7-the-autoencoder-model"><i class="fa fa-check"></i><b>18.5</b> Chapter 7: the autoencoder model</a></li>
<li class="chapter" data-level="18.6" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-8"><i class="fa fa-check"></i><b>18.6</b> Chapter 8</a></li>
<li class="chapter" data-level="18.7" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-11-ensemble-neural-network"><i class="fa fa-check"></i><b>18.7</b> Chapter 11: ensemble neural network</a></li>
<li class="chapter" data-level="18.8" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-12"><i class="fa fa-check"></i><b>18.8</b> Chapter 12</a><ul>
<li class="chapter" data-level="18.8.1" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#ew-portfolios-with-the-tidyverse"><i class="fa fa-check"></i><b>18.8.1</b> EW portfolios with the tidyverse</a></li>
<li class="chapter" data-level="18.8.2" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#advanced-weighting-function"><i class="fa fa-check"></i><b>18.8.2</b> Advanced weighting function</a></li>
<li class="chapter" data-level="18.8.3" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#functional-programming-in-the-backtest"><i class="fa fa-check"></i><b>18.8.3</b> Functional programming in the backtest</a></li>
</ul></li>
<li class="chapter" data-level="18.9" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-15"><i class="fa fa-check"></i><b>18.9</b> Chapter 15</a></li>
<li class="chapter" data-level="18.10" data-path="solutions-to-exercises.html"><a href="solutions-to-exercises.html#chapter-16"><i class="fa fa-check"></i><b>18.10</b> Chapter 16</a></li>
</ul></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Machine Learning for Factor Investing</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div id="interp" class="section level1">
<h1><span class="header-section-number">Chapter 13</span> Interpretability</h1>
<p>
This chapter is dedicated to the techniques that help us understand the way models process inputs into outputs. A recent book (<span class="citation">Molnar (<a href="#ref-molnar2019interpretable" role="doc-biblioref">2019</a>)</span> available at <a href="https://christophm.github.io/interpretable-ml-book/" class="uri">https://christophm.github.io/interpretable-ml-book/</a>) is entirely devoted to this topic and we highly recommend having a look at it. The survey of <span class="citation">Belle and Papantonis (<a href="#ref-belle2020principles" role="doc-biblioref">2020</a>)</span> is also worthwhile. Another more introductory and less technical reference is <span class="citation">Hall and Gill (<a href="#ref-hall2019introduction" role="doc-biblioref">2019</a>)</span>.
Obviously, in this chapter, we will adopt a tone which is factor-investing oriented and discuss examples related to ML models trained on a financial dataset.</p>
<p>Quantitative tools that aim for interpretability of ML models are required to satisfy two simple conditions:</p>
<ol style="list-style-type: decimal">
<li>That they provide information about the model.<br />
</li>
<li>That they are highly comprehensible.</li>
</ol>
<p>Often, these tools generate graphical outputs which are easy to read and yield immediate conclusions.</p>
<p>In attempts to white-box complex machine learning models, one dichotomy stands out:</p>
<ul>
<li><strong>Global models</strong> seek to determine the relative role of features in the construction of the predictions once the model has been trained. This is done at the global level, so that the patterns that are shown in the interpretation hold <em>on average</em> over the whole training set.<br />
</li>
<li><strong>Local models</strong> aim to characterize how the model behaves around one particular instance by considering small variations around this instance. The way these variations are processed by the original model makes it possible to simplify it by approximating it, e.g., in a linear fashion. This approximation can, for example, determine the sign and magnitude of the impact of each relevant feature in the vicinity of the original instance.</li>
</ul>
<p><span class="citation">Molnar (<a href="#ref-molnar2019interpretable" role="doc-biblioref">2019</a>)</span> proposes another classification of interpretability solutions by separating interpretations that depend on one particular model (e.g., linear regression or decision tree) from those that can be obtained for any kind of model. In the sequel, we present the methods according to the global versus local dichotomy.</p>
<div id="global-interpretations" class="section level2">
<h2><span class="header-section-number">13.1</span> Global interpretations</h2>
<div id="surr" class="section level3">
<h3><span class="header-section-number">13.1.1</span> Simple models as surrogates</h3>
<p>
Let us start with the simplest example of all. In a linear model,
<span class="math display">\[y_i=\alpha+\sum_{k=1}^K\beta_kx_i^k+\epsilon_i,\]</span>
the following elements are usually extracted from the estimation of the <span class="math inline">\(\beta_k\)</span>:</p>
<ul>
<li>the <span class="math inline">\(R^2\)</span>, which appreciates the <strong>global fit</strong> of the model (possibly penalized to prevent overfitting with many regressors). The <span class="math inline">\(R^2\)</span> is usually computed in-sample;<br />
</li>
<li>the sign of the estimates <span class="math inline">\(\hat{\beta}_k\)</span>, which indicates the <strong>direction</strong> of the impact of each feature <span class="math inline">\(x^k\)</span> on <span class="math inline">\(y\)</span>;</li>
<li>the <span class="math inline">\(t\)</span>-statistics <span class="math inline">\(t_{\hat{\beta_k}}\)</span>, which evaluate the <strong>magnitude</strong> of this impact: regardless of its direction, large statistics in absolute value reveal prominent variables. Often, the <span class="math inline">\(t\)</span>-statistics are translated into <span class="math inline">\(p\)</span>-values which are computed under some suitable distributional assumptions.</li>
</ul>
<p>The last two indicators are useful because they inform the user on which features matter the most and on the sign of the effect of each predictor. This gives a simplified view of how the model processes the features into the output. Most tools that aim to explain black boxes follow the same principles.</p>
<p>Decision trees, because they are easy to picture, are also great models for interpretability. Thanks to this favorable feature, they are target benchmarks for simple models. Recently, <span class="citation">Vidal, Pacheco, and Schiffer (<a href="#ref-vidal2020born" role="doc-biblioref">2020</a>)</span> propose a method to reduce an ensemble of trees into a single tree. The aim is to propose a simpler model that behaves exactly like the complex one.</p>
<p>More generally, it is an intuitive idea to resort to simple models to proxy more complex algorithms. One simple way to do so is to build so-called <strong>surrogate</strong> models. The process is simple:</p>
<ol style="list-style-type: decimal">
<li>train the original model <span class="math inline">\(f\)</span> on features <span class="math inline">\(\textbf{X}\)</span> and labels <span class="math inline">\(\textbf{y}\)</span>;<br />
</li>
<li>train a simpler model <span class="math inline">\(g\)</span> to explain the predictions of the trained model <span class="math inline">\(\hat{f}\)</span> given the features <span class="math inline">\(\textbf{X}\)</span>:
<span class="math display">\[\hat{f}(\textbf{X})=g(\textbf{X})+\textbf{error}\]</span></li>
</ol>
<p>The estimated model <span class="math inline">\(\hat{g}\)</span> explains how the initial model <span class="math inline">\(\hat{f}\)</span> maps the features into the labels. To illustrate this, we use the <em>iml</em> package (see <span class="citation">Molnar, Casalicchio, and Bischl (<a href="#ref-molnar2018iml" role="doc-biblioref">2018</a>)</span>). The simpler model is a tree with a depth of two.</p>
<div class="sourceCode" id="cb200"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb200-1"><a href="interp.html#cb200-1"></a><span class="kw">library</span>(iml)</span>
<span id="cb200-2"><a href="interp.html#cb200-2"></a>mod <-<span class="st"> </span>Predictor<span class="op">$</span><span class="kw">new</span>(fit_RF, </span>
<span id="cb200-3"><a href="interp.html#cb200-3"></a> <span class="dt">data =</span> training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features)) </span>
<span id="cb200-4"><a href="interp.html#cb200-4"></a>dt <-<span class="st"> </span>TreeSurrogate<span class="op">$</span><span class="kw">new</span>(mod, <span class="dt">maxdepth =</span> <span class="dv">2</span>)</span>
<span id="cb200-5"><a href="interp.html#cb200-5"></a><span class="kw">plot</span>(dt)</span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:imlsurr"></span>
<img src="ML_factor_files/figure-html/imlsurr-1.png" alt="Example of surrogate tree." width="500px" />
<p class="caption">
FIGURE 13.1: Example of surrogate tree.
</p>
</div>
<p>The representation of the tree is different, compared to those seen in Chapter <a href="trees.html#trees">6</a>. Indeed, the four possible outcomes (determined by the conditions in the top lines) no longer yield a simple value (average of the label), but more information is given, in the form of a box plot (including the interquartile range and outliers). In the above representation, it is the top right cluster that seems to have the highest rewards, with especially many upward outliers. This cluster consists of small firms with volatile past returns.</p>
</div>
<div id="variable-importance" class="section level3">
<h3><span class="header-section-number">13.1.2</span> Variable importance (tree-based)</h3>
<p>
One incredibly favorable feature of simple decision trees is their interpretability. Their visual representation is clear and straightforward. Just like regressions (which are another building block in ML), simple trees are easy to comprehend and do not suffer from the black-box rebuke that is often associated with more sophisticated tools.</p>
<p>Indeed, both random forests and boosted trees fail to provide perfectly accurate accounts of what is happening inside the engine. In contrast, it is possible to compute the aggregate share (or importance) of each feature in the determination of the structure of the tree once it has been trained.</p>
<p>After training, it is possible to compute, at each node <span class="math inline">\(n\)</span> the gain <span class="math inline">\(G(n)\)</span> obtained by the subsequent split if there are any, i.e., if the node is not a terminal leaf. It is also easy to determine which variable is chosen to perform the split, hence we write <span class="math inline">\(\mathcal{N}_k\)</span> the set of nodes for which feature <span class="math inline">\(k\)</span> is chosen for the partition. Then, the global importance of each feature is given by
<span class="math display">\[I(k)=\sum_{n\in \mathcal{N}_k}G(n),\]</span>
and it is often rescaled so that the sum of <span class="math inline">\(I(k)\)</span> across all <span class="math inline">\(k\)</span> is equal to one. In this case, <span class="math inline">\(I(k)\)</span> measures the relative contribution of feature <span class="math inline">\(k\)</span> in the reduction of loss during the training. A variable with high importance will have a greater impact on predictions. Generally, these variables are those that are located close to the root of the tree.</p>
<p>Below, we take a look at the results obtained from the tree-based models trained in Chapter <a href="trees.html#trees">6</a>. We start by recycling the output from the three regression models we used. Notice that each fitted output has its own structure and importance vectors have different names.</p>
<div class="sourceCode" id="cb201"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb201-1"><a href="interp.html#cb201-1"></a>tree_VI <-<span class="st"> </span>fit_tree<span class="op">$</span>variable.importance <span class="op">%>%</span><span class="st"> </span><span class="co"># VI from tree model</span></span>
<span id="cb201-2"><a href="interp.html#cb201-2"></a><span class="st"> </span><span class="kw">as_tibble</span>(<span class="dt">rownames =</span> <span class="ot">NA</span>) <span class="op">%>%</span><span class="st"> </span><span class="co"># Transform in tibble </span></span>
<span id="cb201-3"><a href="interp.html#cb201-3"></a><span class="st"> </span><span class="kw">rownames_to_column</span>(<span class="st">"Feature"</span>) <span class="co"># Add feature column</span></span>
<span id="cb201-4"><a href="interp.html#cb201-4"></a>RF_VI <-<span class="st"> </span>fit_RF<span class="op">$</span>importance <span class="op">%>%</span><span class="st"> </span><span class="co"># VI from random forest</span></span>
<span id="cb201-5"><a href="interp.html#cb201-5"></a><span class="st"> </span><span class="kw">as_tibble</span>(<span class="dt">rownames =</span> <span class="ot">NA</span>) <span class="op">%>%</span><span class="st"> </span><span class="co"># Transform in tibble </span></span>
<span id="cb201-6"><a href="interp.html#cb201-6"></a><span class="st"> </span><span class="kw">rownames_to_column</span>(<span class="st">"Feature"</span>) <span class="co"># Add feature column</span></span>
<span id="cb201-7"><a href="interp.html#cb201-7"></a>XGB_VI <-<span class="st"> </span><span class="kw">xgb.importance</span>(<span class="dt">model =</span> fit_xgb)[,<span class="dv">1</span><span class="op">:</span><span class="dv">2</span>] <span class="co"># VI from boosted trees</span></span>
<span id="cb201-8"><a href="interp.html#cb201-8"></a>VI_trees <-<span class="st"> </span>tree_VI <span class="op">%>%</span><span class="st"> </span><span class="kw">left_join</span>(RF_VI) <span class="op">%>%</span><span class="st"> </span><span class="kw">left_join</span>(XGB_VI) <span class="co"># Aggregate the VIs</span></span>
<span id="cb201-9"><a href="interp.html#cb201-9"></a><span class="kw">colnames</span>(VI_trees)[<span class="dv">2</span><span class="op">:</span><span class="dv">4</span>] <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Tree"</span>, <span class="st">"RF"</span>, <span class="st">"XGB"</span>) <span class="co"># New column names</span></span>
<span id="cb201-10"><a href="interp.html#cb201-10"></a>norm_<span class="dv">1</span> <-<span class="st"> </span><span class="cf">function</span>(x){<span class="kw">return</span>(x <span class="op">/</span><span class="st"> </span><span class="kw">sum</span>(x))} <span class="co"># Normalizing function</span></span>
<span id="cb201-11"><a href="interp.html#cb201-11"></a>VI_trees <span class="op">%>%</span><span class="st"> </span>na.omit <span class="op">%>%</span><span class="st"> </span><span class="kw">mutate_if</span>(is.numeric, norm_<span class="dv">1</span>) <span class="op">%>%</span><span class="st"> </span><span class="co"># Plotting sequence</span></span>
<span id="cb201-12"><a href="interp.html#cb201-12"></a><span class="st"> </span><span class="kw">gather</span>(<span class="dt">key =</span> model, <span class="dt">value =</span> value, <span class="op">-</span>Feature) <span class="op">%>%</span></span>
<span id="cb201-13"><a href="interp.html#cb201-13"></a><span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> Feature, <span class="dt">y =</span> value, <span class="dt">fill =</span> model)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_col</span>(<span class="dt">position =</span> <span class="st">"dodge"</span>) <span class="op">+</span></span>
<span id="cb201-14"><a href="interp.html#cb201-14"></a><span class="st"> </span><span class="kw">theme</span>(<span class="dt">axis.text.x =</span> <span class="kw">element_text</span>(<span class="dt">angle =</span> <span class="dv">35</span>, <span class="dt">hjust =</span> <span class="dv">1</span>))</span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:VItrees"></span>
<img src="ML_factor_files/figure-html/VItrees-1.png" alt="Variable importance for tree-based models." width="500px" />
<p class="caption">
FIGURE 13.2: Variable importance for tree-based models.
</p>
</div>
<p>In the above code, tibbles are like dataframes (they are the v2.0 of dataframes, so to speak).
Given the way the graph is coded, Figure <a href="interp.html#fig:VItrees">13.2</a> is in fact misleading. Indeed, by construction, the simple tree model only has a small number of features with nonzero importance: in the above graph, there are only 3: capitalization, price-to-book and volatility. In contrast, because random forest and boosted trees are much more complex, they give some importance to many predictors. The graph shows the variables related to the simple tree model only. For scale reasons, the normalization is performed <em>after</em> the subset of features is chosen. We preferred to limit the number of features shown on the graph for obvious readability concerns.</p>
<p>There are differences in the way the models rely on the features. For instance, the most important feature changes from one model to another: the simple tree model gives the most importance to the price-to-book ratio, while the random forest bets more on volatility and boosted trees give more weight to capitalization.</p>
<p>One defining property of random forests is that they give a chance to all features. Indeed, by randomizing the choice of predictors, each individual exogenous variable has a shot at explaining the label. Along with boosted trees, the allocation of importance is more balanced across predictors, compared to the simple tree which puts most of its eggs in just a few baskets.</p>
</div>
<div id="variable-importance-agnostic" class="section level3">
<h3><span class="header-section-number">13.1.3</span> Variable importance (agnostic)</h3>
<p>
The idea of quantifying the importance of each feature in the learning process can be extended to nontree-based models. We refer to the papers mentioned in the study by <span class="citation">Fisher, Rudin, and Dominici (<a href="#ref-fisher2018all" role="doc-biblioref">2019</a>)</span> for more information on this stream of the literature. The premise is the same as above: the aim is to quantify to what extent one feature contributes to the learning process.</p>
<p>One way to track the added value of one particular feature is to look at what happens if its values inside the training set are entirely shuffled. If the original feature plays an important role in the explanation of the dependent variable, then the shuffled version of the feature will lead to a much higher loss.</p>
<p>The baseline method to assess feature importance in the general case is the following:</p>
<ol style="list-style-type: decimal">
<li>Train the model on the original data and compute the associated loss <span class="math inline">\(l^*\)</span>.<br />
</li>
<li>For each feature <span class="math inline">\(k\)</span>, create a new training dataset in which the feature’s values are randomly permuted. Then, evaluate the loss <span class="math inline">\(l_k\)</span> of the model based on this altered sample.<br />
</li>
<li>Rank the variable importance of each feature, computed as a difference <span class="math inline">\(\text{VI}_k=l_k-l^*\)</span> or a ratio <span class="math inline">\(\text{VI}_k=l_k/l^*\)</span>.</li>
</ol>
<p>Whether to compute the losses on the training set or the testing set is an open question and remains to the appreciation of the analyst.
The above procedure is of course random and can be repeated so that the importances are averaged over several trials: this improves the stability of the results. This algorithm is implemented in the FeatureImp() function of the <em>iml</em> R package developed by the author of <span class="citation">Molnar (<a href="#ref-molnar2019interpretable" role="doc-biblioref">2019</a>)</span>. We also recommend the <em>vip</em> package, see <span class="citation">Greenwell and Boehmke (<a href="#ref-greenwell2020variable" role="doc-biblioref">n.d.</a>)</span>.<br />
Below, we implement this algorithm manually so to speak for the features appearing in Figure <a href="interp.html#fig:VItrees">13.2</a>. We test this approach on ridge regressions and recycle the variables used in Chapter <a href="lasso.html#lasso">5</a>. We start by the first step: computing the loss on the original training sample.</p>
<div class="sourceCode" id="cb202"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb202-1"><a href="interp.html#cb202-1"></a>fit_ridge_<span class="dv">0</span> <-<span class="st"> </span><span class="kw">glmnet</span>(x_penalized_train, y_penalized_train, <span class="co"># Trained model</span></span>
<span id="cb202-2"><a href="interp.html#cb202-2"></a> <span class="dt">alpha =</span> <span class="dv">0</span>, <span class="dt">lambda =</span> <span class="fl">0.01</span>) </span>
<span id="cb202-3"><a href="interp.html#cb202-3"></a>l_star <-<span class="st"> </span><span class="kw">mean</span>((y_penalized_train<span class="op">-</span><span class="kw">predict</span>(fit_ridge_<span class="dv">0</span>, x_penalized_train))<span class="op">^</span><span class="dv">2</span>) <span class="co"># Loss</span></span></code></pre></div>
<p>Next, we evaluate the loss when each of the predictors has been sequentially shuffled. To reduce computation time, we only make one round of shuffling.</p>
<div class="sourceCode" id="cb203"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb203-1"><a href="interp.html#cb203-1"></a>l <-<span class="st"> </span><span class="kw">c</span>() <span class="co"># Initialize</span></span>
<span id="cb203-2"><a href="interp.html#cb203-2"></a><span class="cf">for</span>(i <span class="cf">in</span> <span class="dv">1</span><span class="op">:</span><span class="kw">nrow</span>(VI_trees)){ <span class="co"># Loop on the features</span></span>
<span id="cb203-3"><a href="interp.html#cb203-3"></a> feat_name <-<span class="st"> </span><span class="kw">as.character</span>(VI_trees[i,<span class="dv">1</span>])</span>
<span id="cb203-4"><a href="interp.html#cb203-4"></a> temp_data <-<span class="st"> </span>training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features) <span class="co"># Temp feature matrix</span></span>
<span id="cb203-5"><a href="interp.html#cb203-5"></a> temp_data[, <span class="kw">which</span>(<span class="kw">colnames</span>(temp_data) <span class="op">==</span><span class="st"> </span>feat_name)] <-<span class="st"> </span><span class="co"># Shuffles the values</span></span>
<span id="cb203-6"><a href="interp.html#cb203-6"></a><span class="st"> </span><span class="kw">sample</span>(temp_data[, <span class="kw">which</span>(<span class="kw">colnames</span>(temp_data) <span class="op">==</span><span class="st"> </span>feat_name)]</span>
<span id="cb203-7"><a href="interp.html#cb203-7"></a> <span class="op">%>%</span><span class="st"> </span><span class="kw">pull</span>(<span class="dv">1</span>), <span class="dt">replace =</span> <span class="ot">FALSE</span>)</span>
<span id="cb203-8"><a href="interp.html#cb203-8"></a> x_penalized_temp <-<span class="st"> </span>temp_data <span class="op">%>%</span><span class="st"> </span><span class="kw">as.matrix</span>() <span class="co"># Predictors into matrix</span></span>
<span id="cb203-9"><a href="interp.html#cb203-9"></a> l[i] <-<span class="st"> </span><span class="kw">mean</span>((y_penalized_train<span class="op">-</span><span class="kw">predict</span>(fit_ridge_<span class="dv">0</span>, x_penalized_temp))<span class="op">^</span><span class="dv">2</span>) <span class="co"># = Loss</span></span>
<span id="cb203-10"><a href="interp.html#cb203-10"></a>}</span></code></pre></div>
<p>Finally, we plot the results.</p>
<div class="sourceCode" id="cb204"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb204-1"><a href="interp.html#cb204-1"></a><span class="kw">data.frame</span>(<span class="dt">Feature =</span> VI_trees[,<span class="dv">1</span>], <span class="dt">loss =</span> l <span class="op">-</span><span class="st"> </span>l_star) <span class="op">%>%</span></span>
<span id="cb204-2"><a href="interp.html#cb204-2"></a><span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> Feature, <span class="dt">y =</span> loss)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_col</span>() <span class="op">+</span></span>
<span id="cb204-3"><a href="interp.html#cb204-3"></a><span class="st"> </span><span class="kw">theme</span>(<span class="dt">axis.text.x =</span> <span class="kw">element_text</span>(<span class="dt">angle =</span> <span class="dv">35</span>, <span class="dt">hjust =</span> <span class="dv">1</span>))</span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:VIglobal2"></span>
<img src="ML_factor_files/figure-html/VIglobal2-1.png" alt="Variable importance for a ridge regression model." width="450px" />
<p class="caption">
FIGURE 13.3: Variable importance for a ridge regression model.
</p>
</div>
<p>The resulting importances are in line with those of the tree-based models: the most prominent variables are volatility-based, market capitalization-based, and the price-to-book ratio; these closely match the variables from Figure <a href="interp.html#fig:VItrees">13.2</a>. Note that in some cases (e.g., the share turnover), the score can even be negative, which means that the predictions are more accurate than the baseline model when the values of the predictor are shuffled!</p>
</div>
<div id="partial-dependence-plot" class="section level3">
<h3><span class="header-section-number">13.1.4</span> Partial dependence plot</h3>
<p>
Partial dependence plots (PDPs) aim at showing the relationship between the output of a model and the value of a feature (we refer to section 8.2 of <span class="citation">Friedman (<a href="#ref-friedman2001greedy" role="doc-biblioref">2001</a>)</span> for an early treatment of this subject).</p>
<p>Let us fix a feature <span class="math inline">\(k\)</span>. We want to understand the <strong>average impact</strong> of <span class="math inline">\(k\)</span> on the predictions of the trained model <span class="math inline">\(\hat{f}\)</span>. In order to do so, we assume that the feature space is random and we split it in two: <span class="math inline">\(k\)</span> versus <span class="math inline">\(-k\)</span>, which stands for all features except for <span class="math inline">\(k\)</span>. The partial dependence plot is defined as</p>
<p><span class="math display" id="eq:pdp">\[\begin{equation}
\tag{13.1}
\bar{f}_k(x_k)=\mathbb{E}[\hat{f}(\textbf{x}_{-k},x_k)]=\int \hat{f}(\textbf{x}_{-k},x_k)d\mathbb{P}_{-k}(\textbf{x}_{-k}),
\end{equation}\]</span></p>
<p>where <span class="math inline">\(d\mathbb{P}_{-k}(\cdot)\)</span> is the (multivariate) distribution of the non-<span class="math inline">\(k\)</span> features <span class="math inline">\(\textbf{x}_{-k}\)</span>. The above function takes the feature values <span class="math inline">\(x_k\)</span> as argument and keeps all other features frozen via their sample distributions: this shows the impact of feature <span class="math inline">\(k\)</span> solely. In practice, the average is evaluated using Monte-Carlo simulations:</p>
<p><span class="math display" id="eq:pdpMC">\[\begin{equation}
\tag{13.2}
\bar{f}_k(x_k)\approx \frac{1}{M}\sum_{m=1}^M\hat{f}\left(x_k,\textbf{x}_{-k}^{(m)}\right),
\end{equation}\]</span>
where <span class="math inline">\(\textbf{x}_{-k}^{(m)}\)</span> are independent samples of the non-<span class="math inline">\(k\)</span> features.</p>
<p>Theoretically, PDPs could be computed for more than one feature at a time. In practice, this is only possible for two features (yielding a 3D surface) and is more computationally intense.</p>
<p>We illustrate this concept below, using the dedicated package <em>iml</em> (interpretable machine learning); see also the <em>pdp</em> package documented in <span class="citation">Greenwell (<a href="#ref-greenwell2017pdp" role="doc-biblioref">2017</a>)</span>. The model we seek to explain is the random forest built in Section <a href="trees.html#random-forests">6.2</a>. We recycle some variables used therein. We choose to test the impact of the price-to-book ratio on the outcome of the model.</p>
<div class="sourceCode" id="cb205"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb205-1"><a href="interp.html#cb205-1"></a><span class="kw">library</span>(iml) <span class="co"># One package for interpretability</span></span>
<span id="cb205-2"><a href="interp.html#cb205-2"></a>mod_iml <-<span class="st"> </span>Predictor<span class="op">$</span><span class="kw">new</span>(fit_RF, <span class="co"># This line encapsulates the objects</span></span>
<span id="cb205-3"><a href="interp.html#cb205-3"></a> <span class="dt">data =</span> training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features))</span>
<span id="cb205-4"><a href="interp.html#cb205-4"></a>pdp_PB =<span class="st"> </span>FeatureEffect<span class="op">$</span><span class="kw">new</span>(mod_iml, <span class="dt">feature =</span> <span class="st">"Pb"</span>) <span class="co"># This line computes the PDP for p/b ratio</span></span>
<span id="cb205-5"><a href="interp.html#cb205-5"></a><span class="kw">plot</span>(pdp_PB) <span class="co"># Plot the partial dependence.</span></span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:pdp"></span>
<img src="ML_factor_files/figure-html/pdp-1.png" alt="Partial dependence plot for the price-to-book ratio on the random forest model." width="450px" />
<p class="caption">
FIGURE 13.4: Partial dependence plot for the price-to-book ratio on the random forest model.
</p>
</div>
<p>The average impact of the price-to-book ratio on the predictions is decreasing. This was somewhat expected, given the conditional average of the dependent variable given the price-to-book ratio. This latter function is depicted in Figure <a href="trees.html#fig:rpart3mkt">6.3</a> and shows a behavior comparable to the above curve: strongly decreasing for small values of P/B and then relatively flat. When the price-to-book ratio is low, firms are undervalued. Hence, their higher returns are in line with the <em>value</em> premium.</p>
<p>Finally, we refer to <span class="citation">Zhao and Hastie (<a href="#ref-zhao2019causal" role="doc-biblioref">2020</a>)</span> for a theoretical discussion on the <em>causality</em> property of PDPs. Indeed, a deep look at the construction of the PDPs suggests that they could be interpreted as a causal representation of the feature on the model’s output.</p>
</div>
</div>
<div id="local-interpretations" class="section level2">
<h2><span class="header-section-number">13.2</span> Local interpretations</h2>
<p>Whereas global interpretations seek to assess the impact of features on the output <span class="math inline">\(overall\)</span>, local methods try to quantify the behavior of the model on particular instances or the neighborhood thereof. Local interpretability has recently gained traction and many papers have been published on this topic. Below, we outline the most widespread methods.<a href="#fn27" class="footnote-ref" id="fnref27"><sup>27</sup></a></p>
<div id="lime" class="section level3">
<h3><span class="header-section-number">13.2.1</span> LIME</h3>
<p>
LIME (Local Interpretable Model-Agnostic Explanations) is a methodology originally proposed by <span class="citation">Ribeiro, Singh, and Guestrin (<a href="#ref-ribeiro2016should" role="doc-biblioref">2016</a>)</span>. Their aim is to provide
a faithful account of the model under two constraints:</p>
<ul>
<li><strong>simple interpretability</strong>, which implies a limited number of variables with visual or textual representation. This is to make sure any human can easily understand the outcome of the tool;</li>
<li><strong>local faithfulness</strong>: the explanation holds for the vicinity of the instance.</li>
</ul>
<p>The original (black-box) model is <span class="math inline">\(f\)</span> and we assume we want to approximate its behavior around instance <span class="math inline">\(x\)</span> with the interpretable model <span class="math inline">\(g\)</span>. The simple function <span class="math inline">\(g\)</span> belongs to a larger class <span class="math inline">\(G\)</span>. The vicinity of <span class="math inline">\(x\)</span> is denoted <span class="math inline">\(\pi_x\)</span> and the complexity of <span class="math inline">\(g\)</span> is written <span class="math inline">\(\Omega(g)\)</span>. LIME seeks an interpretation of the form
<span class="math display">\[\xi(x)=\underset{g \in G}{\text{argmin}} \, \mathcal{L}(f,g,\pi_x)+\Omega(g),\]</span>
where <span class="math inline">\(\mathcal{L}(f,g,\pi_x)\)</span> is the loss function (error/imprecision) induced by <span class="math inline">\(g\)</span> in the vicinity <span class="math inline">\(\pi_x\)</span> of <span class="math inline">\(x\)</span>. The penalization <span class="math inline">\(\Omega(g)\)</span> is for instance the number of leaves or depth of a tree, or the number of predictors in a linear regression.</p>
<p>It now remains to define some of the above terms. The vicinity of <span class="math inline">\(x\)</span> is defined by <span class="math inline">\(\pi_x(z)=e^{-D(x,z)^2/\sigma^2},\)</span> where <span class="math inline">\(D\)</span> is some distance measure and <span class="math inline">\(\sigma^2\)</span> some scaling constant. We underline that this function decreases when <span class="math inline">\(z\)</span> shifts away from <span class="math inline">\(x\)</span>.</p>
<p>The tricky part is the loss function. In order to minimize it, LIME generates artificial samples close to <span class="math inline">\(x\)</span> and averages/sums the error on the label that the simple representation makes. For simplicity, we assume a scalar output for <span class="math inline">\(f\)</span>, hence the formulation is the following:
<span class="math display">\[\mathcal{L}(f,g,\pi_x)=\sum_z \pi_x(z)(f(z)-g(z))^2\]</span>
and the errors are weighted according to their distance from the initial instance <span class="math inline">\(x\)</span>: the closest points get the largest weights. In its most basic implementation, the set of models <span class="math inline">\(G\)</span> consists of all linear models.</p>
<p>In Figure <a href="interp.html#fig:lime">13.5</a>, we provide a simplified diagram of how LIME works. </p>
<div class="figure" style="text-align: center"><span id="fig:lime"></span>
<img src="images/lime.png" alt="Simplistic explanation of LIME: the explained instance is surrounded by a red square. Five points are generated (the triangles) and a weighted linear model is fitted accordingly (dashed grey line)." width="300px" />
<p class="caption">
FIGURE 13.5: Simplistic explanation of LIME: the explained instance is surrounded by a red square. Five points are generated (the triangles) and a weighted linear model is fitted accordingly (dashed grey line).
</p>
</div>
<p>For expositional clarity, we work with only one dependent variable. The original training sample is shown with the black points. The fitted (trained) model is represented with the blue line (smoothed conditional average) and we want to approximate how the model works around one particular instance which is highlighted by the red square around it. In order to build the approximation, we sample 5 new points around the instance (the 5 red triangles). Each triangle lies on the blue line (they are model predictions) and has a weight proportional to its size: the triangle closest to the instance has a bigger weight. Using weighted least-squares, we build a linear model that fits to these 5 points (the dashed grey line). This is the outcome of the approximation. It gives the two parameters of the model: the intercept and the slope. Both can be evaluated with standard statistical tests.</p>
<p>The sign of the slope is important. It is fairly clear that if the instance had been taken closer to <span class="math inline">\(x=0\)</span>, the slope would have probably been almost flat and hence the predictor could be locally discarded. Another important detail is the number of sample points. In our explanation, we take only five, but in practice, a robust estimation usually requires around one thousand points or more. Indeed, when too few neighbors are sampled, the estimation risk is high and the approximation may be rough.</p>
<p>We proceed with an example of implementation. There are several steps:</p>
<ol style="list-style-type: decimal">
<li>Fit a model on some training data.<br />
</li>
<li>Wrap everything using the lime() function.<br />
</li>
<li>Focus on a few predictors and see their impact over a few particular instances (via the explain() function).</li>
</ol>
<p>We start with the first step. This time, we work with a boosted tree model. </p>
<div class="sourceCode" id="cb206"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb206-1"><a href="interp.html#cb206-1"></a><span class="kw">library</span>(lime) <span class="co"># Package for LIME interpretation</span></span>
<span id="cb206-2"><a href="interp.html#cb206-2"></a>params_xgb <-<span class="st"> </span><span class="kw">list</span>( <span class="co"># Parameters of the boosted tree</span></span>
<span id="cb206-3"><a href="interp.html#cb206-3"></a> <span class="dt">max_depth =</span> <span class="dv">5</span>, <span class="co"># Max depth of each tree</span></span>
<span id="cb206-4"><a href="interp.html#cb206-4"></a> <span class="dt">eta =</span> <span class="fl">0.5</span>, <span class="co"># Learning rate </span></span>
<span id="cb206-5"><a href="interp.html#cb206-5"></a> <span class="dt">gamma =</span> <span class="fl">0.1</span>, <span class="co"># Penalization</span></span>
<span id="cb206-6"><a href="interp.html#cb206-6"></a> <span class="dt">colsample_bytree =</span> <span class="dv">1</span>, <span class="co"># Proportion of predictors to be sampled (1 = all)</span></span>
<span id="cb206-7"><a href="interp.html#cb206-7"></a> <span class="dt">min_child_weight =</span> <span class="dv">10</span>, <span class="co"># Min number of instances in each node</span></span>
<span id="cb206-8"><a href="interp.html#cb206-8"></a> <span class="dt">subsample =</span> <span class="dv">1</span>) <span class="co"># Proportion of instance to be sampled (1 = all)</span></span>
<span id="cb206-9"><a href="interp.html#cb206-9"></a>xgb_model <-<span class="st"> </span><span class="kw">xgb.train</span>(params_xgb, <span class="co"># Training of the model</span></span>
<span id="cb206-10"><a href="interp.html#cb206-10"></a> train_matrix_xgb, <span class="co"># Training data</span></span>
<span id="cb206-11"><a href="interp.html#cb206-11"></a> <span class="dt">nrounds =</span> <span class="dv">10</span>) <span class="co"># Number of trees</span></span></code></pre></div>
<p>Then, we head on to steps two and three. As underlined above, we resort to the lime() and explain() functions.</p>
<div class="sourceCode" id="cb207"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb207-1"><a href="interp.html#cb207-1"></a>explainer <-<span class="st"> </span><span class="kw">lime</span>(training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short), xgb_model) <span class="co"># Step 2.</span></span>
<span id="cb207-2"><a href="interp.html#cb207-2"></a>explanation <-<span class="st"> </span><span class="kw">explain</span>(<span class="dt">x =</span> training_sample <span class="op">%>%</span><span class="st"> </span><span class="co"># Step 3.</span></span>
<span id="cb207-3"><a href="interp.html#cb207-3"></a><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span></span>
<span id="cb207-4"><a href="interp.html#cb207-4"></a><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">slice</span>(<span class="dv">1</span><span class="op">:</span><span class="dv">2</span>), <span class="co"># First two instances in train_sample </span></span>
<span id="cb207-5"><a href="interp.html#cb207-5"></a> <span class="dt">explainer =</span> explainer, <span class="co"># Explainer variable created above </span></span>
<span id="cb207-6"><a href="interp.html#cb207-6"></a> <span class="dt">n_permutations =</span> <span class="dv">900</span>, <span class="co"># Nb samples for loss function</span></span>
<span id="cb207-7"><a href="interp.html#cb207-7"></a> <span class="dt">dist_fun =</span> <span class="st">"euclidean"</span>, <span class="co"># Dist.func. "gower" is one alternative</span></span>
<span id="cb207-8"><a href="interp.html#cb207-8"></a> <span class="dt">n_features =</span> <span class="dv">6</span> <span class="co"># Nb of features shown (important ones)</span></span>
<span id="cb207-9"><a href="interp.html#cb207-9"></a>)</span>
<span id="cb207-10"><a href="interp.html#cb207-10"></a><span class="kw">plot_features</span>(explanation, <span class="dt">ncol =</span> <span class="dv">1</span>) <span class="co"># Visual display</span></span></code></pre></div>
<p><img src="ML_factor_files/figure-html/lime1-1.png" width="450px" style="display: block; margin: auto;" /></p>
<p>In each graph (one graph corresponds to the explanation around one instance), there are two types of information: the sign of the impact and the magnitude of the impact. The sign is revealed with the color (positive in blue, negative in red) and the magnitude is shown with the size of the rectangles.</p>
<p>The values to the left of the graphs show the ranges of the features with which the local approximations were computed. </p>
<p>Lastly, we briefly discuss the choice of distance function chosen in the code. It is used to evaluate the discrepancy between the true instance and a simulated one to give more or less weight to the prediction of the sampled instance. Our dataset comprises only numerical data; hence, the Euclidean distance is a natural choice:</p>
<p><span class="math display">\[\text{Euclidean}(\textbf{x}, \textbf{y})=\sqrt{\sum_{n=1}^N(x_n-y_n)^2}.\]</span>
Another possible choice would be the Manhattan distance:
<span class="math display">\[\text{Manhattan}(\textbf{x}, \textbf{y})=\sum_{n=1}^N|x_n-y_n|.\]</span></p>
<p>The problem with these two distances is that they fail to handle categorical variables. This is where the Gower distance steps in (<span class="citation">Gower (<a href="#ref-gower1971general" role="doc-biblioref">1971</a>)</span>). The distance imposes a different treatment on features of different types (classes versus numbers essentially, but it can also handle missing data!). For categorical features, the Gower distance applies a binary treatment: the value is equal to 1 if the features are equal, and to zero if not (i.e., <span class="math inline">\(1_{\{x_n=y_n\}}\)</span>). For numerical features, the spread is quantified as <span class="math inline">\(1-\frac{|x_n-y_n|}{R_n}\)</span>, where <span class="math inline">\(R_n\)</span> is the maximum absolute value the feature can take. All similarity measurements are then aggregated to yield the final score. Note that in this case, the logic is reversed: <span class="math inline">\(\textbf{x}\)</span> and <span class="math inline">\(\textbf{y}\)</span> are very close if the Gower distance is close to one, and they are far away if the distance is close to zero.</p>
</div>
<div id="shapley-values" class="section level3">
<h3><span class="header-section-number">13.2.2</span> Shapley values</h3>
<p>
The approach of Shapley values is somewhat different compared to LIME and closer in spirit to PDPs. It originates from cooperative game theory (<span class="citation">Shapley (<a href="#ref-shapley1953value" role="doc-biblioref">1953</a>)</span>). The rationale is the following. One way to assess the impact (or usefulness) of a variable is to look at what happens if we remove this variable from the dataset. If this is very detrimental to the quality of the model (i.e., to the accuracy of its predictions), then it means that the variable is substantially valuable.</p>
<p>The simplest way to proceed is to take all variables and remove one to evaluate its predictive ability. Shapley values are computed on a larger scale because they consider all possible combinations of variables to which they add the target predictor. Formally, this gives:</p>
<p><span class="math display" id="eq:shapley">\[\begin{equation}
\tag{13.3}
\phi_k=\sum_{S \subseteq \{x_1,\dots,x_K \} \backslash x_k}\underbrace{\frac{\text{Card}(S)!(K-\text{Card}(S)-1)!}{K!}}_{\text{weight of coalition}}\underbrace{\left(\hat{f}_{S \cup \{x_k\}}(S \cup \{x_k\})-\hat{f}_S(S)\right)}_{\text{gain when adding } x_k}
\end{equation}\]</span></p>
<span class="math inline">\(S\)</span> is any subset of the set of features that doesn’t include feature <span class="math inline">\(k\)</span> and its size is Card(<span class="math inline">\(S\)</span>).<br />
In the equation above, the model <span class="math inline">\(f\)</span> must be altered because it’s impossible to evaluate <span class="math inline">\(f\)</span> when features are missing. In this case, there are several possible options:
<p>Obviously, Shapley values can take a lot of time to compute if the number of predictors is large. We refer to <span class="citation">Chen et al. (<a href="#ref-chen2018shapley" role="doc-biblioref">2018</a>)</span> for a discussion on a simplifying method that reduces computation times in this case.
Extensions of Shapley values for interpretability are studied in <span class="citation">Lundberg and Lee (<a href="#ref-lundberg2017unified" role="doc-biblioref">2017</a>)</span>.</p>
<p>
The implementation of Shapley values is permitted in R via the <em>iml</em> package. There are two restrictions compared to LIME. First, the features must be filtered upfront because all features are shown on the graph (which becomes illegible beyond 20 features). This is why in the code below, we use the short list of predictors (from Section <a href="notdata.html#dataset">1.2</a>).
Second, instances are analyzed one at a time.</p>
<p>We start by fitting a random forest model.</p>
<div class="sourceCode" id="cb208"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb208-1"><a href="interp.html#cb208-1"></a>fit_RF_short <-<span class="st"> </span><span class="kw">randomForest</span>(R1M_Usd <span class="op">~</span>., <span class="co"># Same formula as for simple trees!</span></span>
<span id="cb208-2"><a href="interp.html#cb208-2"></a> <span class="dt">data =</span> training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(<span class="kw">c</span>(features_short), <span class="st">"R1M_Usd"</span>), </span>
<span id="cb208-3"><a href="interp.html#cb208-3"></a> <span class="dt">sampsize =</span> <span class="dv">10000</span>, <span class="co"># Size of (random) sample for each tree</span></span>
<span id="cb208-4"><a href="interp.html#cb208-4"></a> <span class="dt">replace =</span> <span class="ot">FALSE</span>, <span class="co"># Is the sampling done with replacement?</span></span>
<span id="cb208-5"><a href="interp.html#cb208-5"></a> <span class="dt">nodesize =</span> <span class="dv">250</span>, <span class="co"># Minimum size of terminal cluster</span></span>
<span id="cb208-6"><a href="interp.html#cb208-6"></a> <span class="dt">ntree =</span> <span class="dv">40</span>, <span class="co"># Nb of random trees</span></span>
<span id="cb208-7"><a href="interp.html#cb208-7"></a> <span class="dt">mtry =</span> <span class="dv">4</span> <span class="co"># Nb of predictive variables for each tree</span></span>
<span id="cb208-8"><a href="interp.html#cb208-8"></a> )</span></code></pre></div>
<p>We can then analyze the behavior of the model around the first instance of the training sample.</p>
<div class="sourceCode" id="cb209"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb209-1"><a href="interp.html#cb209-1"></a>predictor <-<span class="st"> </span>Predictor<span class="op">$</span><span class="kw">new</span>(fit_RF_short, <span class="co"># This wraps the model & data</span></span>
<span id="cb209-2"><a href="interp.html#cb209-2"></a> <span class="dt">data =</span> training_sample <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short), </span>
<span id="cb209-3"><a href="interp.html#cb209-3"></a> <span class="dt">y =</span> training_sample<span class="op">$</span>R1M_Usd)</span>
<span id="cb209-4"><a href="interp.html#cb209-4"></a>shapley <-<span class="st"> </span>Shapley<span class="op">$</span><span class="kw">new</span>(predictor, <span class="co"># Compute the Shapley values...</span></span>
<span id="cb209-5"><a href="interp.html#cb209-5"></a> <span class="dt">x.interest =</span> training_sample <span class="op">%>%</span><span class="st"> </span></span>
<span id="cb209-6"><a href="interp.html#cb209-6"></a><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short) <span class="op">%>%</span></span>
<span id="cb209-7"><a href="interp.html#cb209-7"></a><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">slice</span>(<span class="dv">1</span>)) <span class="co"># On the first instance</span></span>
<span id="cb209-8"><a href="interp.html#cb209-8"></a><span class="kw">plot</span>(shapley) <span class="op">+</span><span class="st"> </span><span class="kw">coord_fixed</span>(<span class="dv">1500</span>) <span class="op">+</span><span class="st"> </span><span class="co"># Plot</span></span>
<span id="cb209-9"><a href="interp.html#cb209-9"></a><span class="st"> </span><span class="kw">theme</span>(<span class="dt">axis.text.x =</span> <span class="kw">element_text</span>(<span class="dt">angle =</span> <span class="dv">35</span>, <span class="dt">hjust =</span> <span class="dv">1</span>)) <span class="op">+</span><span class="st"> </span><span class="kw">coord_flip</span>() </span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:shapley"></span>
<img src="ML_factor_files/figure-html/shapley-1.png" alt="Illustration of the Shapley method." width="450px" />
<p class="caption">
FIGURE 13.6: Illustration of the Shapley method.
</p>
</div>
<p>In the output shown in Figure <a href="interp.html#fig:shapley">13.6</a>, we again obtain the two crucial insights: <strong>sign</strong> of the impact of the feature and <strong>relative importance</strong> (compared to other features).</p>
</div>
<div id="breakdown" class="section level3">
<h3><span class="header-section-number">13.2.3</span> Breakdown</h3>
<p>
Breakdown (see, e.g., <span class="citation">Staniak and Biecek (<a href="#ref-staniak2018explanations" role="doc-biblioref">2018</a>)</span>) is a mixture of ideas from PDPs and Shapley values. The core of breakdown is the so-called <strong>relaxed model prediction</strong> defined in Equation <a href="interp.html#eq:breakdown">(13.4)</a>. It is close in spirit to Equation <a href="interp.html#eq:pdp">(13.1)</a>. The difference is that we are working at the local level, i.e., on one particular observation, say <span class="math inline">\(x^*\)</span>. We want to measure the impact of a set of predictors on the prediction associated to <span class="math inline">\(x^*\)</span>; hence, we fix two sets <span class="math inline">\(\textbf{k}\)</span> (fixed features) and <span class="math inline">\(-\textbf{k}\)</span> (free features) and evaluate a <strong>proxy</strong> for the average prediction of the estimated model <span class="math inline">\(\hat{f}\)</span> when the set <span class="math inline">\(\textbf{k}\)</span> of predictors is fixed at the values of <span class="math inline">\(x^*\)</span>, that is, equal to <span class="math inline">\(x^*_{\textbf{k}}\)</span> in the expression below:</p>
<p><span class="math display" id="eq:breakdown">\[\begin{equation}
\tag{13.4}
\tilde{f}_{\textbf{k}}(x^*)=\frac{1}{M}\sum_{m=1}^M \hat{f}\left(x^{(m)}_{-\textbf{k}},x^*_{\textbf{k}} \right).
\end{equation}\]</span></p>
<p>The <span class="math inline">\(x^{(m)}\)</span> in the above expression are either simulated values of instances or simply sampled values from the dataset. The notation implies that the instance has some values replaced by those of <span class="math inline">\(x^*\)</span>, namely those that correspond to the indices <span class="math inline">\(\textbf{k}\)</span>. When <span class="math inline">\(\textbf{k}\)</span> consists of all features, then <span class="math inline">\(\tilde{f}_{\textbf{k}}(x^*)\)</span> is equal to the raw model prediction <span class="math inline">\(\hat{f}(x^*)\)</span> and when <span class="math inline">\(\textbf{k}\)</span> is empty, it is equal to the average sample value of the label (constant prediction).</p>
<p>The quantity of interest is the so-called contribution of feature <span class="math inline">\(j\notin \textbf{k}\)</span> with respect to data point <span class="math inline">\(x^*\)</span> and set <span class="math inline">\(\textbf{k}\)</span>:</p>
<p><span class="math display">\[\phi_{\textbf{k}}^j(x^*)=\tilde{f}_{\textbf{k} \cup j}(x^*)-\tilde{f}_{\textbf{k}}(x^*).\]</span></p>
<p>Just as for Shapley values, the above indicator computes an average impact when augmenting the set of predictors with feature <span class="math inline">\(j\)</span>. By definition, it depends on the set <span class="math inline">\(\textbf{k}\)</span>, so this is one notable difference with Shapley values (that span <em>all</em> permutations). In <span class="citation">Staniak and Biecek (<a href="#ref-staniak2018explanations" role="doc-biblioref">2018</a>)</span>, the authors devise a procedure that incrementally increases or decreases the set <span class="math inline">\(\textbf{k}\)</span>. This greedy idea helps alleviate the burden of computing all possible combinations of features. Moreover, a very convenient property of their algorithm is that the sum of all contributions is equal to the predicted value:
<span class="math display">\[\sum_j \phi_{\textbf{k}}^j(x^*)=\hat{f}(x^*).\]</span></p>
<p>The visualization makes that very easy to see (as in Figure <a href="interp.html#fig:breakdown">13.7</a> below).</p>
<p>In order to illustrate one implementation of breakdown, we train a random forest on a limited number of features, as shown below. This will increase the readability of the output of the breakdown.</p>
<div class="sourceCode" id="cb210"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb210-1"><a href="interp.html#cb210-1"></a>formula_short <-<span class="st"> </span><span class="kw">paste</span>(<span class="st">"R1M_Usd ~"</span>, <span class="kw">paste</span>(features_short, <span class="dt">collapse =</span> <span class="st">" + "</span>)) <span class="co"># Model </span></span>
<span id="cb210-2"><a href="interp.html#cb210-2"></a>formula_short <-<span class="st"> </span><span class="kw">as.formula</span>(formula_short) <span class="co"># Formula format</span></span>
<span id="cb210-3"><a href="interp.html#cb210-3"></a>fit_RF_short <-<span class="st"> </span><span class="kw">randomForest</span>(formula_short, <span class="co"># Same formula as before</span></span>
<span id="cb210-4"><a href="interp.html#cb210-4"></a> <span class="dt">data =</span> dplyr<span class="op">::</span><span class="kw">select</span>(training_sample, <span class="kw">c</span>(features_short, <span class="st">"R1M_Usd"</span>)), </span>
<span id="cb210-5"><a href="interp.html#cb210-5"></a> <span class="dt">sampsize =</span> <span class="dv">10000</span>, <span class="co"># Size of (random) sample for each tree</span></span>
<span id="cb210-6"><a href="interp.html#cb210-6"></a> <span class="dt">replace =</span> <span class="ot">FALSE</span>, <span class="co"># Is the sampling done with replacement?</span></span>
<span id="cb210-7"><a href="interp.html#cb210-7"></a> <span class="dt">nodesize =</span> <span class="dv">250</span>, <span class="co"># Minimum size of terminal cluster</span></span>
<span id="cb210-8"><a href="interp.html#cb210-8"></a> <span class="dt">ntree =</span> <span class="dv">12</span>, <span class="co"># Nb of random trees</span></span>
<span id="cb210-9"><a href="interp.html#cb210-9"></a> <span class="dt">mtry =</span> <span class="dv">5</span> <span class="co"># Nb of predictive variables for each tree</span></span>
<span id="cb210-10"><a href="interp.html#cb210-10"></a> )</span></code></pre></div>
<p>Once the model is trained, the syntax for the breakdown of predictions is very simple.</p>
<div class="sourceCode" id="cb211"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb211-1"><a href="interp.html#cb211-1"></a><span class="kw">library</span>(breakDown)</span>
<span id="cb211-2"><a href="interp.html#cb211-2"></a>explain_break <-<span class="st"> </span><span class="kw">broken</span>(fit_RF_short, </span>
<span id="cb211-3"><a href="interp.html#cb211-3"></a> data_ml[<span class="dv">6</span>,] <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short),</span>
<span id="cb211-4"><a href="interp.html#cb211-4"></a> <span class="dt">data =</span> data_ml <span class="op">%>%</span><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">select</span>(features_short))</span>
<span id="cb211-5"><a href="interp.html#cb211-5"></a><span class="kw">plot</span>(explain_break) </span></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:breakdown"></span>
<img src="ML_factor_files/figure-html/breakdown-1.png" alt="Example of a breakdown output." width="500px" />
<p class="caption">
FIGURE 13.7: Example of a breakdown output.
</p>
</div>
<p>The graphical output is intuitively interpreted. The grey bar is the prediction of the model at the chosen instance. Green bars signal a positive contribution and the yellowish rectangles show the variables with negative impact. The relative sizes indicate the importance of each feature.</p>
</div>
</div>
</div>
<h3>References</h3>
<div id="refs" class="references">
<div id="ref-belle2020principles">
<p>Belle, Vaishak, and Ioannis Papantonis. 2020. “Principles and Practice of Explainable Machine Learning.” <em>arXiv Preprint</em>, no. 2009.11698.</p>
</div>
<div id="ref-chen2018shapley">
<p>Chen, Jianbo, Le Song, Martin J Wainwright, and Michael I Jordan. 2018. “L-Shapley and c-Shapley: Efficient Model Interpretation for Structured Data.” <em>arXiv Preprint</em>, no. 1808.02610.</p>
</div>
<div id="ref-fisher2018all">
<p>Fisher, Aaron, Cynthia Rudin, and Francesca Dominici. 2019. “All Models Are Wrong, but Many Are Useful: Learning a Variable’s Importance by Studying an Entire Class of Prediction Models Simultaneously.” <em>Journal of Machine Learning Research</em> 20 (177): 1–81.</p>
</div>
<div id="ref-friedman2001greedy">
<p>Friedman, Jerome H. 2001. “Greedy Function Approximation: A Gradient Boosting Machine.” <em>Annals of Statistics</em>, 1189–1232.</p>
</div>
<div id="ref-gower1971general">
<p>Gower, John C. 1971. “A General Coefficient of Similarity and Some of Its Properties.” <em>Biometrics</em>, 857–71.</p>
</div>
<div id="ref-greenwell2017pdp">
<p>Greenwell, Brandon M. 2017. “Pdp: An R Package for Constructing Partial Dependence Plots.” <em>R Journal</em> 9 (1): 421–36.</p>
</div>
<div id="ref-greenwell2020variable">
<p>Greenwell, Brandon M, and Bradley C Boehmke. n.d. “Variable Importance Plots: An Introduction to the Vip Package.” <em>R Journal</em>.</p>
</div>
<div id="ref-hall2019introduction">
<p>Hall, Patrick, and Navdeep Gill. 2019. <em>An Introduction to Machine Learning Interpretability - Second Edition</em>. O’Reilly.</p>
</div>
<div id="ref-lundberg2017unified">
<p>Lundberg, Scott M, and Su-In Lee. 2017. “A Unified Approach to Interpreting Model Predictions.” In <em>Advances in Neural Information Processing Systems</em>, 4765–74.</p>
</div>
<div id="ref-molnar2019interpretable">
<p>Molnar, Christoph. 2019. <em>Interpretable Machine Learning: A Guide for Making Black Box Models Explainable</em>. LeanPub / Lulu.</p>
</div>
<div id="ref-molnar2018iml">
<p>Molnar, Christoph, Giuseppe Casalicchio, and Bernd Bischl. 2018. “Iml: An R Package for Interpretable Machine Learning.” <em>Journal of Open Source Software</em> 3 (27): 786.</p>
</div>
<div id="ref-ribeiro2016should">
<p>Ribeiro, Marco Tulio, Sameer Singh, and Carlos Guestrin. 2016. “Why Should I Trust You?: Explaining the Predictions of Any Classifier.” In <em>Proceedings of the 22nd Acm Sigkdd International Conference on Knowledge Discovery and Data Mining</em>, 1135–44. ACM.</p>
</div>
<div id="ref-shapley1953value">
<p>Shapley, Lloyd S. 1953. “A Value for N-Person Games.” <em>Contributions to the Theory of Games</em> 2 (28): 307–17.</p>
</div>
<div id="ref-staniak2018explanations">
<p>Staniak, Mateusz, and Przemyslaw Biecek. 2018. “Explanations of Model Predictions with Live and breakDown Packages.” <em>arXiv Preprint</em>, no. 1804.01955.</p>
</div>
<div id="ref-vidal2020born">
<p>Vidal, Thibaut, Toni Pacheco, and Maximilian Schiffer. 2020. “Born-Again Tree Ensembles.” <em>arXiv Preprint</em>, no. 2003.11132.</p>
</div>
<div id="ref-zhao2019causal">
<p>Zhao, Qingyuan, and Trevor Hastie. 2020. “Causal Interpretations of Black-Box Models.” <em>Journal of Business &amp; Economic Statistics</em> Forthcoming.</p>
</div>
</div>
<div class="footnotes">
<hr />
<ol start="27">
<li id="fn27"><p>For instance, we do not mention the work of <span class="citation">Horel and Giesecke (<a href="#ref-horel2019towards" role="doc-biblioref">2019</a>)</span> but the interested reader can have a look at their work on neural networks (and also at the references cited in the paper).<a href="interp.html#fnref27" class="footnote-back">↩︎</a></p></li>
</ol>
</div>
</section>
</div>
</div>
</div>
<a href="backtest.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
<a href="causality.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
</div>
</div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/clipboard.min.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-clipboard.js"></script>
<script>
// Initialize the gitbook UI once its module loader is ready.
gitbook.require(["gitbook"], function (gitbook) {
  // Page-level configuration: sharing buttons, font settings,
  // table-of-contents behavior, toolbar, and search.
  var config = {
    sharing: {
      github: false,
      facebook: false,
      twitter: true,
      linkedin: true,
      weibo: false,
      instapaper: false,
      vk: false,
      all: ["facebook", "twitter", "linkedin", "weibo", "instapaper"]
    },
    fontsettings: {
      theme: "white",
      family: "sans",
      size: 2
    },
    edit: null,
    history: {
      link: null,
      text: null
    },
    view: {
      link: null,
      text: null
    },
    download: null,
    toc: {
      collapse: "section",
      scroll_highlight: true
    },
    toolbar: {
      position: "fixed",
      download: false
    },
    search: true,
    info: true
  };
  gitbook.start(config);
});
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
// Dynamically inject the MathJax script so the page also renders math
// when produced in "self-contained" mode by bookdown.
(function () {
  var script = document.createElement("script");
  script.type = "text/javascript";
  // The template placeholder "true" (or "") means "use the default CDN".
  var src = "true";
  if (src === "" || src === "true") src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML";
  // Keep the explicit https: scheme. The previous code rewrote the URL to a
  // protocol-relative "//…" form, which would fetch MathJax over plain http
  // on http-served pages — always loading over https is safe everywhere,
  // including pages opened from file: URLs.
  script.src = src;
  document.getElementsByTagName("head")[0].appendChild(script);
})();
</script>
</body>
</html>