-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathindex.html
More file actions
624 lines (613 loc) · 43.7 KB
/
index.html
File metadata and controls
624 lines (613 loc) · 43.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>LiNT-II | Documentation</title>
<script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
<script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
<script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
<script src="https://cdn.jsdelivr.net/npm/prismjs/prism.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/prismjs/plugins/autoloader/prism-autoloader.min.js"></script>
<script async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<script type="module" src="https://lcvriend.github.io/wc-darkmode-toggle/src/darkmode-toggle.js"></script>
<link rel="stylesheet" href="https://latex.vercel.app/prism/prism.css">
<link rel="stylesheet" href="https://latex.vercel.app/style.css">
<style>
/* Page shell: cap overall width and center the page. */
html, body {
max-width: 1000px !important;
margin-inline: auto;
}
/* Reading column width, shared via custom property with the grid below. */
body {
--max-content-width: 80ch;
/* Demo visualizer component: light background, vertical breathing room. */
lint-ii-visualizer {
--background: hsl(12, 50%, 97%);
margin-block: 2.5rem;
}
/* Dark-mode override: darken the visualizer background when the
darkmode-toggle web component reports the dark scheme. */
&:has(darkmode-toggle[scheme="dark"]) {
lint-ii-visualizer {
--background: hsl(12, 50%, 7%);
}
}
}
/* Pin the theme toggle to the top-right corner of the page. */
darkmode-toggle {
position: absolute;
right: 1em;
top: 0;
font-size: 1.5rem;
}
/* Content grid: a centered reading column ("content") plus a wider
"breakout" track for full-width elements such as the visualizer.
Named grid lines: breakout-start/end and content-start/end. */
header, footer, section {
display: grid;
grid-template-columns:
minmax(1rem, 1fr) [breakout-start]
minmax(0, auto) [content-start]
min(var(--max-content-width), calc(100% - 2rem))
[content-end] minmax(0, auto)
[breakout-end] minmax(1rem, 1fr);
/* Default: children occupy the narrow reading column. */
& > * {
grid-column: content;
}
/* Opt-in wide elements span the breakout track; scroll if overflowing. */
& > .breakout {
grid-column: breakout;
overflow-x: auto;
}
}
/* Separate the page header from the body with a rule. */
header {
border-bottom: 1px solid currentColor;
}
/* Radio controls in the demo: pointer cursor, no accidental text selection. */
input, label {
cursor: pointer;
user-select: none;
}
/* Floating circular "back to top" button, bottom-right of the viewport. */
.back-to-top {
position: fixed;
right: 2rem;
bottom: 2rem;
width: 2.5rem;
height: 2.5rem;
z-index: 999;
background-color: transparent;
color: inherit;
font-size: 1.25rem;
cursor: pointer;
font-family: monospace;
border: 1px solid;
border-radius: 50%;
line-height: 1;
opacity: .7;
/* Emphasize on hover: full opacity, thicker ring. */
&:hover {
opacity: 1;
border: 2px solid;
}
}
/* Utility class toggled by script to show/hide the button. */
.hidden {
display: none;
}
</style>
</head>
<body>
<darkmode-toggle></darkmode-toggle>
<button class="back-to-top hidden">↑</button>
<header>
<h1><strong>LiNT-II</strong>: readability assessment for Dutch</h1>
</header>
<section>
<h2>Table of contents</h2>
<ol>
<li><a href="#introduction">Introduction</a></li>
<li><a href="#demo">Demo</a></li>
<li><a href="#lint-and-lint-ii">LiNT and LiNT-II</a></li>
<li><a href="#linguistic-features">Linguistic Features</a></li>
<li><a href="#formula-scores-and-difficulty-levels">Formula, Scores and Difficulty Levels</a></li>
<li><a href="#references-and-credits">References and Credits</a></li>
</ol>
</section>
<section>
<h2 id="introduction">Introduction</h2>
<ul>
<li><strong>LiNT-II</strong> analyzes Dutch text readability using four linguistic features: word frequency, syntactic dependency length, information density (number of content words per clause), and proportion of concrete vs abstract nouns. <a href="#linguistic-features"><em>Read more</em></a></li>
<li><strong>LiNT-II</strong> outputs a readability score between 0 and 100; the higher the score is, the more difficult the text is. The scores can be mapped to four difficulty levels: a text of Level 1 is estimated to be difficult for 14% of adult Dutch readers, while a text of Level 4 is estimated to be difficult for 78% of adult Dutch readers. <a href="#formula-scores-and-difficulty-levels"><em>Read more</em></a></li>
<li><strong>LiNT-II</strong> scores and levels are based on an empirical comprehension study, where understanding of different texts was assessed using a <em>cloze test</em> (fill-in missing words). The study involved 120 texts; 2700 Dutch high-school students participated. <a href="#lint-and-lint-ii"><em>Read more</em></a></li>
<li>For code and usage, please refer to the <a href="https://github.com/vanboefer/lint_ii">GitHub repo</a>.</li>
</ul>
</section>
<section>
<h2 id="demo">Demo</h2>
<p>Select one of the 4 texts below to see the detailed LiNT-II analysis:</p>
<div style="margin-bottom: 1.5rem;">
<label><input type="radio" name="text_examples" value="level1" checked> Level 1</label>
<label><input type="radio" name="text_examples" value="level2"> Level 2</label>
<label><input type="radio" name="text_examples" value="level3"> Level 3</label>
<label><input type="radio" name="text_examples" value="level4"> Level 4</label>
</div>
<lint-ii-visualizer id="viz" class="breakout"></lint-ii-visualizer>
<ul>
<li>The upper bar shows the score and difficulty level for the whole document.</li>
<li>Click on ∑ (right-upper corner) to see additional document-level statistics.</li>
<li>On the text itself, you can see the difficulty level per sentence; hover above it to see additional sentence-level statistics.</li>
<li>Hover above each word to see the word-level linguistic features, like word frequency, semantic type, etc. To understand the features and how they are calculated, see <a href="#linguistic-features">Linguistic Features</a>.</li>
</ul>
</section>
<section>
<h2 id="lint-and-lint-ii">LiNT and LiNT-II</h2>
<h3>Background and motivation</h3>
<p><strong>LiNT-II</strong> is a new implementation of the original <strong>LiNT</strong> (<em>Leesbaarheidsinstrument voor Nederlandse Teksten</em>) tool.</p>
<p>The original LiNT utilizes the legacy NLP pipeline <a href="https://github.com/CentreForDigitalHumanities/tscan">T-Scan</a> to extract linguistic features from text; this software is difficult to install and to run and is therefore not suitable for many use cases.</p>
<p>LiNT-II is a modern Python package, with <a href="https://spacy.io/">spaCy</a> under the hood. It can be easily installed with <a href="https://pypi.org/project/pip/"><code>pip</code></a>, and integrated into other software; it is fast and therefore suitable for production setups.</p>
<p>In order to preserve the scientific integrity of the tool, LiNT-II was developed in close collaboration with <strong>Henk Pander Maat</strong>, one of the researchers who developed the original LiNT.</p>
<h3>Original LiNT</h3>
<p>The first version of LiNT was developed in the NWO project <em>Toward a validated reading level tool for Dutch</em> (2012-2017). Later versions were developed in the <em>Digital Humanities Lab</em> of Utrecht University.</p>
<p>More details about the original LiNT can be found on:</p>
<ul>
<li><a href="https://lint.hum.uu.nl/home">LiNT (Utrecht University)</a></li>
<li><a href="https://www.gebruikercentraal.nl/hulpmiddelen/lint-leesbaarheidsinstrument-voor-nederlandse-teksten/">LiNT (Gebruiker Centraal)</a></li>
</ul>
<p>The research on which LiNT is based, including the empirical comprehension study and the development of the model, is described in:</p>
<ul>
<li><a href="https://lint.hum.uu.nl/assets/kleijn-2018.pdf">PhD thesis of Suzanne Kleijn (2018)</a> (English)</li>
<li><a href="https://www.aup-online.com/content/journals/10.5117/TVT2023.3.002.MAAT">Pander Maat et al. 2023</a> (Dutch)</li>
</ul>
<h3>LiNT-II</h3>
<p>In LiNT-II, text is processed with <a href="https://spacy.io/">spaCy</a>, instead of the original <a href="https://github.com/CentreForDigitalHumanities/tscan">T-Scan</a>. This includes, for example, splitting the text into sentences and tokens, tagging the part-of-speech of each token (noun, verb, etc.), and parsing the syntactic structure of the sentence. We use the spaCy model <a href="https://spacy.io/models/nl#nl_core_news_lg"><code>nl_core_news_lg</code></a>.</p>
<p>Processing the text with different software affects the values of the <a href="#linguistic-features">linguistic features</a>. Therefore, we fitted a <strong>new model</strong> on the comprehension data that was collected for the original LiNT. The new model leads to a new LiNT-II formula for calculating the readability score. For more information, read <a href="#formula-scores-and-difficulty-levels">here</a>.</p>
<h2 id="linguistic-features">Linguistic Features</h2>
<h3>Overview</h3>
<p>The readability score of LiNT-II is calculated based on 4 features:</p>
<table>
<thead>
<tr>
<th>Feature</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>word frequency</strong></td>
<td>Mean word frequency of all the content words in the text (excluding proper nouns). <br>➡ Less frequent
words make a text more difficult.</td>
</tr>
<tr>
<td><strong>syntactic dependency length</strong></td>
<td>Syntactic dependency length (SDL) is the number of words between a syntactic head and its dependent
(e.g., verb-subject). We take the biggest SDL in each sentence, and calculate their mean value for the
whole text. <br>➡ Bigger SDL's make a text more difficult.</td>
</tr>
<tr>
<td><strong>content words per clause</strong></td>
<td>Mean number of content words per clause. <br>➡ Larger number of content words indicates dense
information and makes a text more difficult.</td>
</tr>
<tr>
<td><strong>proportion concrete nouns</strong></td>
<td>Mean proportion of concrete nouns out of all the nouns in the text. <br>➡ Smaller proportion of concrete
nouns (i.e. many abstract nouns) makes a text more difficult.</td>
</tr>
</tbody>
</table>
<h4>Definitions</h4>
<ul>
<li><em><strong>Content words</strong></em> are words that possess semantic content and contribute to the meaning of the sentence. We consider a word as a content word if it belongs to one of the following <a href="https://universaldependencies.org/u/pos/">parts-of-speech (POS)</a>: nouns (<code>NOUN</code>), proper nouns (<code>PROPN</code>), lexical verbs (<code>VERB</code>), adjectives (<code>ADJ</code>), or if it's a manner adverb (based on a <a href="https://github.com/vanboefer/lint_ii/blob/main/src/lint_ii/linguistic_data/data/manner_adverbs_20251017.parquet">custom list</a>).</li>
<li><em><strong>Clause</strong></em>: A clause is a group of words that contains a subject and a verb, functioning as a part of a sentence. In this library, the number of clauses is determined by the number of finite verbs (=verbs that show tense) in the sentence.</li>
</ul>
<h3>Word Frequency</h3>
<h4>Why word frequencies?</h4>
<p>Words that are not common in spoken language tend to be less familiar to people and therefore more difficult to process and understand. We can estimate how familiar a certain word is by measuring its frequency, i.e. counting its occurrences in a big text corpus.</p>
<h4>Choice of corpus</h4>
<p>LiNT-II calculates word frequencies from <a href="https://github.com/vanboefer/lint_ii/blob/main/src/lint_ii/linguistic_data/data/subtlex_wordfreq.parquet">SUBTLEX-NL</a> (<a href="https://link.springer.com/article/10.3758/brm.42.3.643">Keuleers et al. 2010</a>): a corpus of Dutch subtitles, which contains about 40 million words. This corpus was chosen for the original LiNT after elaborate analysis and consideration; for details, please refer to the <a href="https://raw.githubusercontent.com/CentreForDigitalHumanities/tscan/master/docs/tscanhandleiding.pdf">T-Scan manual</a> and <a href="https://lint.hum.uu.nl/assets/pander-maat-en-dekker-2016.pdf">Pander Maat & Dekker 2016</a>.</p>
<blockquote>
<p>During the development of LiNT-II, we also experimented with using frequencies from <a href="https://github.com/rspeer/wordfreq">wordfreq</a> instead of SUBTLEX-NL. The <a href="https://github.com/rspeer/wordfreq">wordfreq</a> corpus is a lot bigger and contains multiple genres: subtitles (SUBTLEX-NL, OpenSubtitles), Wikipedia, news (NewsCrawl, GlobalVoices), web text (OSCAR), social media (Twitter). However, <a href="https://github.com/rspeer/wordfreq">wordfreq</a> frequencies gave lower results when <a href="#formula-scores-and-difficulty-levels">fitting the model</a> on comprehension data. This suggests that SUBTLEX-NL might be a better approximation of spoken language than a bigger corpus that contains a lot of written language like news and Wikipedia.</p>
</blockquote>
<p>It is important to note that any corpus captures language use only partially. Since the SUBTLEX-NL corpus is based on Dutch subtitles for English-speaking shows, some words that are common in a Dutch-speaking context might be less frequent there (e.g., <em>fietser</em> "cyclist"). In addition, the shows are from the years 2000-2010; new words from the last 15 years (<em>Instagram</em>, <em>covid</em>) are not in the corpus. Additional corrections were applied to address some of these issues, as described <a href="#corrections-and-exceptions">below</a>.</p>
<h4>What do the values mean?</h4>
<p>We calculate the frequencies on a Zipf scale (<a href="https://journals.sagepub.com/doi/full/10.1080/17470218.2013.850521">Van Heuven et al. 2014</a>):</p>
\[
\text{Zipf value} = \log_{10}(\text{frequency per billion words})
\]
<p>A Zipf value of 1 means that a word appears once per 100 million words, a Zipf value of 2 means that a word appears once per 10 million words, a Zipf value of 3 means that a word appears once per million words, and so on.</p>
<p>In line with the original LiNT and <a href="https://journals.sagepub.com/doi/full/10.1080/17470218.2013.850521">Van Heuven et al. 2014</a>, we consider words with a Zipf value <strong>smaller than 3 as "uncommon"</strong>; these words appear in the SUBTLEX-NL corpus less than once per million words. Examples: <em>afdwaling</em>: 1.66, <em>napraterij</em>: 1.66.</p>
<p>The SUBTLEX-NL corpus with our calculated Zipf values can be found <a href="https://github.com/vanboefer/lint_ii/blob/main/src/lint_ii/linguistic_data/data/subtlex_wordfreq.parquet">here</a>.</p>
<h4>Corrections and exceptions</h4>
<p>The corrections and exceptions applied in LiNT-II are the same ones as in the original LiNT.</p>
<ul>
<li>We calculate word frequencies only for <a href="#definitions">content words</a>, since function words are generally frequent (for example, <em>dat</em>: 7.34, <em>de</em>: 7.38, <em>en</em>: 7.14) and do not contribute to the difficulty of the text. From the content words, we exclude proper nouns (names of people, places, etc.) since their frequency cannot be reliably linked to the difficulty of the text.</li>
<li>For transparent compounds (e.g., <em>duwboot</em> "towboat"), we use the frequency of the base word (<em>boot</em> "boat"), rather than the frequency of the compound as a whole. Previous research shows that this provides a better estimate of word difficulty; for more details, see <a href="https://lint.hum.uu.nl/assets/pander-maat-en-dekker-2016.pdf">Pander Maat & Dekker 2016</a>. The compounds and their base words are identified based on a manually-annotated <a href="https://github.com/vanboefer/lint_ii/blob/main/src/lint_ii/linguistic_data/data/nouns_sem_types_20251118.parquet">list</a>. The list contains 123,307 compounds: 63,316 singular forms and 59,991 plural forms; for the plural forms, the base word is given in singular (for example, the base word of both <em>integriteitstoets</em> "integrity test" and <em>integriteitstoetsen</em> "integrity tests" is <em>toets</em> "test").</li>
<li>As mentioned above, some words that are missing or infrequent in the SUBTLEX-NL corpus are actually pretty common in the spoken language. This includes new words that entered the Dutch language after 2010 (e.g., <em>appen</em> "to send a message on WhatsApp"), and words that are common in a Dutch-speaking context but might be not common in English-speaking TV shows (e.g., <em>knutselen</em> "to craft", <em>fietser</em> "cyclist"). To address the most obvious discrepancies of this sort, the makers of the original LiNT manually created a list of words that should be skipped when calculating frequencies. So instead of incorrectly getting a low frequency, these words don't get any frequency value at all, and so do not mistakenly affect the difficulty score. For more details on how this was done, see the <a href="https://raw.githubusercontent.com/CentreForDigitalHumanities/tscan/master/docs/tscanhandleiding.pdf">T-Scan manual</a>. The list can be found <a href="https://github.com/vanboefer/lint_ii/blob/main/src/lint_ii/linguistic_data/data/subtlex_wordfreq_skiplist.parquet">here</a>.</li>
</ul>
<h3>Syntactic Dependency Length (SDL)</h3>
<h4>Why SDL?</h4>
<p>Syntactic dependency length (SDL) is the number of words between a syntactic head and its dependent (e.g., verb-subject). The bigger the distance between a head and its dependent is, the more difficult it is to process and understand the sentence. This phenomenon is called a <a href="https://nl.wikipedia.org/wiki/Tangconstructie"><em>tangconstructie</em></a>.</p>
<h4>Calculating SDLs</h4>
<p>To calculate the SDLs in the sentence, we use the <a href="https://spacy.io/api/dependencyparser">dependency parsing</a> of spaCy. The parser of the <a href="https://spacy.io/models/nl#nl_core_news_lg">Dutch model</a> that we use was trained on the <a href="https://github.com/UniversalDependencies/UD_Dutch-Alpino">Alpino UD corpus</a>.</p>
<p>For each token in the sentence, we identify its head(s) and then count the number of intervening tokens between the token and its head. The head is generally taken from the spaCy parser, except for the two cases described <a href="#corrections-and-exceptions">below</a>. In each sentence, we take the longest SDL as an indicator of difficulty. For the document-level readability analysis, we take the mean of all the sentence-level max SDLs.</p>
<p><strong>Example</strong>: In the sentence <em>"De Oudegracht is het sfeervolle hart van de stad."</em>, the longest SDL is between the subject of the sentence <em>Oudegracht</em> and the root (main predicate) of the sentence
<em>hart</em>; the max SDL is 3 (three intervening tokens: <em>is, het, sfeervolle</em>).</p>
<h4 id="corrections-and-exceptions">Corrections and exceptions</h4>
<p>There are three cases in which we do not follow spaCy's dependency analysis:</p>
<ul>
<li>Punctuation: spaCy parser considers punctuation marks as tokens and assigns a head to them. In our analysis, we override this behavior: (a) punctuation marks are not counted as intervening tokens for SDL calculation, (b) for a punctuation mark, the dependency length is always set to 0 (instead of counting the distance to the head).</li>
<li>Conjunctions (1): In a conjunction relation, spaCy considers the first conjunct as the head of the second. For example, in the sentence <em>"Je zoekt informatie in naslagwerken via trefwoorden in de <strong>index</strong> of het <strong>register</strong>."</em>, where the words <em>index</em> and <em>register</em> are connected with the conjunction <em>of</em>, spaCy considers <em>index</em> as the head of <em>register</em>. We override this behavior: if a token is in a conjunction then the head of the last conjunct is taken recursively from the first, i.e. the head of both <em>index</em> and <em>register</em> is <em>trefwoorden</em> in our analysis.</li>
<li>Conjunctions (2): When the main predicate of the sentence (ROOT) is in a conjunction relation, spaCy connects only the first conjunct to the subject. For example, in the sentence <em>"Dat geluid <strong>klinkt</strong> in het midden- en kleinbedrijf en moet worden <strong>gehoord</strong>."</em>, the subject <em>geluid</em> is connected to its head <em>klinkt</em> but not to <em>gehoord</em>. We override this behavior: if a token is the subject, we check whether its head (ROOT) has conjuncts. If so, we consider the conjuncts as the heads of the subject as well. In our example, this means that the subject <em>geluid</em> has two heads [<em>klinkt</em>, <em>gehoord</em>]. Since the dependency length between <em>geluid</em> and <em>gehoord</em> is bigger than the one between <em>geluid</em> and <em>klinkt</em>, we take the former into account for the SDL calculation.</li>
</ul>
<p>These exceptions and corrections were done based on a manual analysis of a sample of 200 sentences performed by Henk Pander Maat, one of the creators of the original LiNT. He identified these 3 issues as the main systematic differences between the spaCy parser and the parser used in the original LiNT.</p>
<h3>Content Words per Clause</h3>
<h4>Why content words per clause?</h4>
<p>A clause is a group of words that contains a subject and a verb. A simple sentence contains one clause; longer sentences may contain additional clauses, for example subordinate clauses or clauses connected with words like "and" or "because". For this metric, the number of clauses is not important; what we analyze is the number of content words <strong>in each clause</strong>.</p>
<p>A clause with a lot of content words is dense in information and is therefore more difficult to process and understand. For example, compare the sentence <em>"Ik verknalde het proefwerk."</em> with the sentence <em>"Ik verknalde het proefwerk Wiskunde gisteren bij het laatste schoolexamen."</em>. In both cases, the sentence contains one clause (one subject and one verb), but in the second sentence there is a lot more information, which is introduced through four extra content words (<em>Wiskunde, gisteren, laatste, schoolexamen</em>).</p>
<h4>Calculating content words per clause</h4>
<p>We calculate the number of clauses in the sentence by counting the number of finite verbs, i.e., verbs that show tense. This is done using the spaCy fine-grained part-of-speech tag "WW|pv" (<em>werkwoord, persoonsvorm</em>).</p>
<p>We calculate the number of content words by counting all words that have the following <a href="https://universaldependencies.org/u/pos/">parts-of-speech (POS)</a>: nouns (NOUN), proper nouns (PROPN), lexical verbs (VERB), adjectives (ADJ). To these, we also add a <a href="https://github.com/vanboefer/lint_ii/blob/main/src/lint_ii/linguistic_data/data/manner_adverbs_20251017.parquet">list of 69 manner adverbs</a>, which we consider content words; other adverbs are not included since they are considered function words, in line with the original LiNT. For more information, see the <a href="https://raw.githubusercontent.com/CentreForDigitalHumanities/tscan/master/docs/tscanhandleiding.pdf">T-Scan manual</a>.</p>
<h3>Proportion of Concrete Nouns</h3>
<h4>Why concrete and abstract nouns?</h4>
<p>Concrete nouns refer to specific, tangible items that can be perceived through the senses, like "apple" or "car". Abstract nouns, on the other hand, represent general ideas or concepts that cannot be physically touched, such as "freedom" or "happiness". Research suggests that a more concrete text is easier to understand; for example, adding examples helps understanding because examples make ideas more specific and concrete.</p>
<h4>LiNT-II noun list</h4>
<p>The <a href="https://github.com/vanboefer/lint_ii/blob/main/src/lint_ii/linguistic_data/data/nouns_sem_types_20251118.parquet">noun list</a> was created for the original LiNT and further revised and updated for LiNT-II. The original annotation work was done by Henk Pander Maat, Nick Dekker and N. van Houten; the revisions and additions for LiNT-II were done by Henk Pander Maat.</p>
<p>The list contains 164,671 nouns, annotated for their semantic type (e.g., "human", "place") and semantic class ("abstract", "concrete", "undefined"); the full annotation scheme is described <a href="#semantic-types-annotation">below</a>. The annotations are based on an existing lexicon -- <a href="https://taalmaterialen.ivdnt.org/download/tstc-referentiebestand-nederlands/">Referentiebestand Nederlands (Martin & Maks 2005)</a> -- which was expanded and revised. For more information about how the original list was created, see the <a href="https://raw.githubusercontent.com/CentreForDigitalHumanities/tscan/master/docs/tscanhandleiding.pdf">T-Scan manual</a>.</p>
<p>Descriptive statistics of the LiNT-II noun list:</p>
<ul>
<li>The list contains 164,671 nouns in total.</li>
<li>The list contains both the <strong>singular</strong> and the <strong>plural</strong> forms of the nouns, unlike the original list, which contained singular forms only. The plural forms were added to improve the coverage, since plurals are not always correctly lemmatized by the <a href="https://spacy.io/models/nl#nl_core_news_lg">spaCy model</a>. There are 85,888 singular forms and 78,783 plural forms in the list.</li>
<li>123,307 nouns in the list are compounds (e.g., <em>duwboot</em> "towboat"). For compounds, the base word (<em>boot</em> "boat") and the modifier (<em>duw</em> "tow") are annotated. This information is used in the word frequency feature, as described <a href="#corrections-and-exceptions">above</a>. For plural form compounds (N=59,991), the base word is given in singular (for example, the base word of both <em>integriteitstoets</em> "integrity test" and <em>integriteitstoetsen</em> "integrity tests" is <em>toets</em> "test").</li>
<li>The distribution of semantic classes in the list is as follows:
<ul>
<li>concrete: 84,144</li>
<li>abstract: 78,018</li>
<li>undefined: 2,509</li>
</ul>
</li>
</ul>
<h4>Semantic types scheme</h4>
<p>The nouns in the list are divided into 14 semantic types (see the table below), which are in turn classified into two classes: <strong>abstract</strong> and <strong>concrete</strong>. Ambiguous words that have both an abstract and a concrete meaning are classified as <strong>undefined</strong>.</p>
<table>
<thead>
<tr>
<th>Semantic class</th>
<th>Semantic type</th>
<th>Examples</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="10">concrete</td>
<td>human</td>
<td><em>economiedocenten</em>, <em>assistent</em></td>
</tr>
<tr>
<td>nonhuman</td>
<td><em>sardine</em>, <em>eik</em></td>
</tr>
<tr>
<td>artefact</td>
<td><em>stoel</em>, <em>barometers</em></td>
</tr>
<tr>
<td>concrete substance</td>
<td><em>modder</em>, <em>lichaamsvloeistoffen</em></td>
</tr>
<tr>
<td>food and care</td>
<td><em>melk</em>, <em>lettertjesvermicelli</em></td>
</tr>
<tr>
<td>measure</td>
<td><em>euro</em>, <em>kwartje</em></td>
</tr>
<tr>
<td>place</td>
<td><em>amsterdam</em>, <em>voorkamer</em></td>
</tr>
<tr>
<td>time</td>
<td><em>kerstavond</em>, <em>periode</em></td>
</tr>
<tr>
<td>concrete event</td>
<td><em>ademhaling</em>, <em>stakingsacties</em></td>
</tr>
<tr>
<td>miscellaneous concrete</td>
<td><em>galblaas</em>, <em>vulkaan</em></td>
</tr>
<tr style="border-top: 1px solid">
<td rowspan="4">abstract</td>
<td>abstract substance</td>
<td><em>fosfor</em>, <em>tumorcellen</em></td>
</tr>
<tr>
<td>abstract event</td>
<td><em>crisis</em>, <em>status-update</em></td>
</tr>
<tr>
<td>organization</td>
<td><em>nato</em>, <em>warenautoriteit</em></td>
</tr>
<tr>
<td>miscellaneous abstract (nondynamic)</td>
<td><em>motto</em>, <em>woordfrequentie</em></td>
</tr>
<tr style="border-top: 1px solid">
<td>undefined</td>
<td><em>ambiguous words that belong to more than one type</em></td>
<td><em>steun</em>, <em>underground</em></td>
</tr>
</tbody>
</table>
<h4>Calculating the proportion of concrete nouns</h4>
<p>We calculate the proportion of concrete nouns in the document as follows:</p>
\[
\frac{N_{\text{concrete}}}{N_{\text{concrete}} + N_{\text{abstract}} + N_{\text{undefined}}}
\]
</section>
<section>
<h2 id="formula-scores-and-difficulty-levels">Formula, Scores and Difficulty Levels</h2>
<h3>Where does LiNT-II Formula Come from?</h3>
<h4>Original LiNT: data and model</h4>
<p>For the development of the original LiNT, an empirical comprehension study was done. In this study, 2700 Dutch high-school students read 120 texts; their understanding of the texts was assessed using a <em>cloze test</em> (fill-in missing words). This comprehension dataset was then used by the researchers to fit a <a href="https://en.wikipedia.org/wiki/Linear_regression">linear regression model</a>; the model expresses which features of the text best predict the students' performance in the cloze test.</p>
<p>The developers of LiNT started with 12 different text features; step-by-step, they eliminated features which were not predictive enough or were highly correlated with other features. By the end of this process, they were left with the 4 features: word frequency, syntactic dependency length, number of content words per clause, and proportion of concrete nouns. These features explain 74% of the variance in the comprehension dataset (Adjusted \( R^2 = 0.74 \)). The regression model assigns each of these features a weight (coefficient) and this is the formula used to assess text readability.</p>
<p>The research on which LiNT is based, including the empirical comprehension study and the development of the model, is described in:</p>
<ul>
<li><a href="https://lint.hum.uu.nl/assets/kleijn-2018.pdf">PhD thesis of Suzanne Kleijn (2018)</a> (English)</li>
<li><a href="https://www.aup-online.com/content/journals/10.5117/TVT2023.3.002.MAAT">Pander Maat et al. 2023</a> (Dutch)</li>
</ul>
<h4>LiNT-II model</h4>
<p>For the development of LiNT-II, we used the same comprehension dataset and the same 4 features as in the original LiNT.</p>
<p>Since LiNT-II uses different software for the linguistic analysis, the values of the features are different from LiNT; therefore, a new model was fitted on the comprehension data. The LiNT-II model performs as well as the original LiNT model: it explains 74% of the variance in the comprehension dataset (Adjusted \( R^2 = 0.74 \)). Below, additional details about the model are shown.</p>
<table class="col-1-l col-2-r col-3-r col-4-r col-5-r col-6-r">
<thead>
<tr>
<th>Parameter</th>
<th>Coefficient</th>
<th>Standardized Coefficient (Beta)</th>
<th>Correlation (zero-order)</th>
<th>Partial Correlation</th>
<th>Variance Inflation Factor</th>
</tr>
</thead>
<tbody>
<tr>
<td>constant</td>
<td>-4.21</td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>word frequency</td>
<td>17.28</td>
<td>0.403</td>
<td>0.72</td>
<td>0.56</td>
<td>1.43</td>
</tr>
<tr>
<td>syntactic dependency length</td>
<td>-1.62</td>
<td>-0.255</td>
<td>-0.66</td>
<td>-0.34</td>
<td>1.99</td>
</tr>
<tr>
<td>content words per clause</td>
<td>-2.54</td>
<td>-0.218</td>
<td>-0.70</td>
<td>-0.28</td>
<td>2.30</td>
</tr>
<tr>
<td>proportion concrete nouns</td>
<td>16.00</td>
<td>0.246</td>
<td>0.56</td>
<td>0.40</td>
<td>1.27</td>
</tr>
</tbody>
</table>
<ul>
<li><em><strong>Standardized Coefficients (Beta)</strong></em> indicate how many standard deviations the dependent variable will change for a one standard deviation change in each independent variable. This makes it easier to compare the relative importance of different predictors.</li>
<li><em><strong>Zero-order Correlation</strong></em>: Simple Pearson correlation between each predictor and the dependent variable (ignoring all other variables).</li>
<li><em><strong>Partial Correlation</strong></em>: Correlation between each predictor and the dependent variable, controlling for all other predictors in the model; i.e., its unique contribution after controlling for other variables.</li>
<li><em><strong>Variance Inflation Factor (VIF)</strong></em> is a measure of multicollinearity; a high VIF (> 5) indicates that the variable is highly correlated with others, which can make the model less reliable and harder to interpret.</li>
</ul>
<h3>LiNT-II Formula &amp; Score</h3>
<p>The readability score is calculated based on the following formula:</p>
\[
\begin{align}
\text{LiNT-II score} = 100 - (
& -\text{4.21} \\
& + \text{17.28} \cdot \text{word frequency} \\
& - \text{1.62} \cdot \text{syntactic dependency length} \\
& - \text{2.54} \cdot \text{content words per clause} \\
& + \text{16.00} \cdot \text{proportion concrete nouns}
)
\end{align}
\]
<h3>LiNT-II Difficulty Levels</h3>
<p>LiNT-II scores are mapped to 4 difficulty levels. For each level, it is estimated how many adult Dutch readers have difficulty understanding texts on this level.</p>
<table class="col-1-r col-2-r col-3-r">
<thead>
<tr>
<th style="width: 10ch;">Score</th>
<th>Difficulty level</th>
<th>Proportion of adults who have difficulty understanding this level</th>
</tr>
</thead>
<tbody>
<tr>
<td>[0-34)</td>
<td>1</td>
<td>14%</td>
</tr>
<tr>
<td>[34-46)</td>
<td>2</td>
<td>29%</td>
</tr>
<tr>
<td>[46-58)</td>
<td>3</td>
<td>53%</td>
</tr>
<tr>
<td>[58-100]</td>
<td>4</td>
<td>78%</td>
</tr>
</tbody>
</table>
<p>The estimation is done in the same way as for the original LiNT, based on the comprehension dataset. For a detailed explanation, please refer to <a href="https://www.aup-online.com/content/journals/10.5117/TVT2023.3.002.MAAT">Pander Maat et al. 2023</a>.</p>
<h3>Handling missing values</h3>
<p>When the value for at least one of the four <a href="#linguistic-features">features</a> in the formula is
<code>None</code>, LiNT-II score is not calculated (returns <code>None</code>). This happens in the following cases:
</p>
<ul>
<li><code>word frequency</code> is None when (a) there are no content words in the sentence, or (b) all content words are in the <a href="https://github.com/vanboefer/lint_ii/blob/main/src/lint_ii/linguistic_data/data/subtlex_wordfreq_skiplist.parquet">"skip-list"</a>, as explained <a href="#corrections-and-exceptions">here</a>.</li>
<li><code>syntactic dependency length</code> is None when the sentence consists of one word only, excluding punctuation.</li>
<li><code>content words per clause</code> is None if there are no finite verbs in the sentence.</li>
<li><code>proportion concrete nouns</code> is None if there are no nouns or only 'unknown' nouns ('unknown' = not in the <a href="https://github.com/vanboefer/lint_ii/blob/main/src/lint_ii/linguistic_data/data/nouns_sem_types_20251118.parquet">"noun list"</a>) in the sentence.</li>
</ul>
<p>Examples:</p>
<table class="col-2-r col-3-r col-4-r col-5-r">
<thead>
<tr>
<th>sentence</th>
<th><code>word frequency</code></th>
<th><code>syntactic dependency length</code></th>
<th><code>content words per clause</code></th>
<th><code>proportion concrete nouns</code></th>
</tr>
</thead>
<tbody>
<tr>
<td><em>Waarom?</em></td>
<td>None</td>
<td>None</td>
<td>None</td>
<td>None</td>
</tr>
<tr>
<td><em>Waarom is het zo?</em></td>
<td>None</td>
<td>2</td>
<td>0</td>
<td>None</td>
</tr>
<tr>
<td><em>Misschien is het net opgekomen.</em></td>
<td>None<sup>*</sup></td>
<td>3</td>
<td>1</td>
<td>None</td>
</tr>
<tr>
<td><em>Wat rechtsom kan , zou ook linksom moeten kunnen.</em></td>
<td>4.12</td>
<td>5</td>
<td>1.5</td>
<td>None</td>
</tr>
</tbody>
</table>
<p>*<em>opgekomen</em> is a content word, but it is in the <a
href="https://github.com/vanboefer/lint_ii/blob/main/src/lint_ii/linguistic_data/data/subtlex_wordfreq_skiplist.parquet">"skip-list"</a>
</p>
</section>
<section>
<h2 id="references-and-credits">References and Credits</h2>
<h3>LiNT-II</h3>
<p>LiNT-II was developed by <a href="https://www.linkedin.com/in/jeniakim/">Jenia Kim</a> (Hogeschool Utrecht, VU Amsterdam), in collaboration with <a href="https://www.uu.nl/medewerkers/HLWPanderMaat">Henk Pander Maat</a> (Utrecht University).</p>
<p>If you use this library, please cite as follows:</p>
<pre><code>@software{lint_ii,
author = {Kim, Jenia and Pander Maat, Henk},
title = {{LiNT-II: readability assessment for Dutch}},
year = {2025},
url = {https://github.com/vanboefer/lint_ii},
version = {0.1.1},
note = {Python package}
}
</code></pre>
<ul>
<li>Special thanks to <a href="https://www.uu.nl/staff/APJvandenBosch">Antal van den Bosch</a> (Utrecht University) for setting up and facilitating the collaboration.</li>
<li>Special thanks to <a href="https://github.com/lcvriend">Lawrence Vriend</a> for his work on the <strong>LiNT-II Visualizer</strong> and other help with the code.</li>
<li>The code for LiNT-II was inspired by a spaCy implementation of LiNT by the City of Amsterdam: <a href="https://github.com/Amsterdam/alletaal-lint">alletaal-lint</a>.</li>
</ul>
<h3>Original LiNT</h3>
<p>The first version of LiNT was developed in the NWO project <em>Toward a validated reading level tool for Dutch</em> (2012-2017). Later versions were developed in the <em>Digital Humanities Lab</em> of Utrecht University.</p>
<p>More details about the original LiNT can be found on:</p>
<ul>
<li><a href="https://lint.hum.uu.nl/home">LiNT (Utrecht University)</a></li>
<li><a href="https://www.gebruikercentraal.nl/hulpmiddelen/lint-leesbaarheidsinstrument-voor-nederlandse-teksten/">LiNT (Gebruiker Centraal)</a></li>
</ul>
<p>The readability research on which LiNT is based is described in the <a href="https://lint.hum.uu.nl/assets/kleijn-2018.pdf">PhD thesis of Suzanne Kleijn</a> (English) and in <a href="https://www.aup-online.com/content/journals/10.5117/TVT2023.3.002.MAAT">Pander Maat et al. 2023</a> (Dutch). Please cite as follows:</p>
<pre><code>@article{pander2023lint,
title={{LiNT}: een leesbaarheidsformule en een leesbaarheidsinstrument},
author={Pander Maat, Henk and Kleijn, Suzanne and Frissen, Servaas},
journal={Tijdschrift voor Taalbeheersing},
volume={45},
number={1},
pages={2--39},
year={2023},
publisher={Amsterdam University Press Amsterdam}
}
</code></pre>
<pre><code>@phdthesis{kleijn2018clozing,
title={Clozing in on readability: How linguistic features affect and predict text comprehension and on-line processing},
author={Kleijn, Suzanne},
year={2018},
school={Utrecht University}
}
</code></pre>
</section>
<footer></footer>
<script type="module" src="./src/visualizer/lint_ii_visualizer.js"></script>
<script type="module">
const viz = document.getElementById('viz')
const radios = document.querySelectorAll('input[name="text_examples"]')
const loadLevel = (level) => {
viz.loadFromUrl(`./examples_for_visualizer/${level}.json`)
}
radios.forEach(radio => {
radio.addEventListener('change', (e) => {
if (e.target.checked) loadLevel(e.target.value)
})
})
// Initialize with checked radio
const checked = document.querySelector('input[name="text_examples"]:checked')
if (checked) loadLevel(checked.value)
</script>
<script>
document.addEventListener("color-scheme-change", event => {
event.detail.scheme === "dark"
? document.body.classList.add("latex-dark")
: document.body.classList.remove("latex-dark")
const viz = document.getElementById('viz')
const statsContainer = viz.shadowRoot?.querySelector('[data-view="stats"]')
if (statsContainer && statsContainer.children.length > 0) {
viz.renderStats()
}
})
</script>
<script>
const showOnPx = 100
const backToTopButton = document.querySelector(".back-to-top")
const scrollContainer = () => {
return document.documentElement ?? document.body
}
const goToTop = () => {
document.body.scrollIntoView({
behavior: "smooth",
})
}
document.addEventListener("scroll", () => {
if (scrollContainer().scrollTop > showOnPx) {
backToTopButton.classList.remove("hidden")
} else {
backToTopButton.classList.add("hidden")
}
})
backToTopButton.addEventListener("click", goToTop)
</script>
</body>
</html>