hanproject/annotator_comparison.py at main · hanproj/hanproject · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#! C:\Python36\
# -*- encoding: utf-8 -*-
from PyQt5 import QtWidgets, uic
#from PyQt5 import QtGui.QColor
from PyQt5.QtGui import QColor

#from PyQt5.QtCore import
#QtWidgets.Qcol
import sys
import os
import re
import copy
from soas_network_utils import get_hanproj_dir
from soas_network_utils import readlines_of_utf8_file
from soas_rnetwork_test import delete_file_if_it_exists
from soas_network_utils import append_line_to_output_file
#from soas_network_utils import append_line_to_utf8_file
from soas_imported_from_py3 import append_line_to_utf8_file
from soas_network_utils import is_hanzi
from soas_network_utils import get_rhyme_groups_from_annotated_poem
from soas_network_utils import exception_chars
from soas_network_utils import if_file_exists
from soas_network_utils import readin_most_complete_schuessler_data
from soas_network_utils import is_poem_annotated
from soas_network_utils import does_line_have_rhyme_marker
from soas_network_utils import get_rhyme_word_and_marker_from_line_of_poem

#rw, m = get_rhyme_word_and_marker_from_line_of_poem(p)

#from soas_network_utils import readin_results_of_community_detection
from soas_network_utils import readin_community_detection_group_descriptions

def debug_msg(msg, origin, do_print_msg=False):
    """Print msg, prefixed by origin and a space when origin is non-empty.

    Nothing is printed unless do_print_msg is true.
    """
    if not do_print_msg:
        return
    prefix = origin + ' ' if origin else ''
    print(prefix + msg)

def file_does_exist(filename):
    """Return True when filename names an existing regular file, else False."""
    return bool(os.path.isfile(filename))

def get_received_shi_naively_annotated_file():
    """Return the path of the naively annotated received-shi file."""
    base_dir = get_hanproj_dir()
    return os.path.join(base_dir, 'received-shi', 'naively_annotated_Lu_1983_先秦漢魏晉南北朝詩.txt')

def get_received_shi_com_det_annotated_file():
    """Return the path of the com_det annotated received-shi file."""
    base_dir = get_hanproj_dir()
    return os.path.join(base_dir, 'received-shi', 'com_det_annotated_Lu_1983_先秦漢魏晉南北朝詩.txt')

def get_received_shi_schuessler_annotated_file():
    """Return the path of the schuessler annotated received-shi file."""
    base_dir = get_hanproj_dir()
    return os.path.join(base_dir, 'received-shi', 'schuessler_annotated_Lu_1983_先秦漢魏晉南北朝詩.txt')

def get_mirors_naively_annotated_file():
    """Return the path of the naively annotated mirrors file."""
    base_dir = get_hanproj_dir()
    return os.path.join(base_dir, 'mirrors', 'naively_annotated_kyomeishusei2015_han_mirror_data.txt')

def get_mirors_schuessler_annotated_file():
    """Return the path of the schuessler annotated mirrors file."""
    base_dir = get_hanproj_dir()
    return os.path.join(base_dir, 'mirrors', 'schuessler_annotated_kyomeishusei2015_han_mirror_data.txt')

def get_mirrors_com_det_annotated_file():
    """Return the path of the com_det annotated mirrors file."""
    base_dir = get_hanproj_dir()
    return os.path.join(base_dir, 'mirrors', 'com_det_annotated_kyomeishusei2015_han_mirror_data.txt')

def get_stelae_naively_annotated_file():
    """Return the path of the naively annotated stelae file."""
    base_dir = get_hanproj_dir()
    return os.path.join(base_dir, 'stelae', 'naively_annotated_毛遠明 《漢魏六朝碑刻校注》.txt')

def get_stelae_schuessler_annotated_file():
    """Return the path of the schuessler annotated stelae file."""
    base_dir = get_hanproj_dir()
    return os.path.join(base_dir, 'stelae', 'schuessler_annotated_毛遠明 《漢魏六朝碑刻校注》.txt')

def get_stelae_com_det_annotated_file():
    """Return the path of the com_det annotated stelae file."""
    base_dir = get_hanproj_dir()
    return os.path.join(base_dir, 'stelae', 'com_det_annotated_毛遠明 《漢魏六朝碑刻校注》.txt')

def readin_received_shi_naively_annotated_data():
    """Read the naively annotated received-shi file into a poem dictionary."""
    source_file = get_received_shi_naively_annotated_file()
    return readin_received_shi_data(source_file)

def readin_received_shi_schuessler_annotated_data():
    """Read the schuessler annotated received-shi file into a poem dictionary."""
    source_file = get_received_shi_schuessler_annotated_file()
    return readin_received_shi_data(source_file)

def readin_received_shi_com_det_annotated_data():
    """Read the com_det annotated received-shi file into a poem dictionary."""
    source_file = get_received_shi_com_det_annotated_file()
    return readin_received_shi_data(source_file)

#kyomeishusei2015_00001: 脩相思、毌相a忘。常樂未a央。
#kyomeishusei2015_00002: 脩相思、願毌相a忘。大樂未a央。
#kyomeishusei2015_00003: 安樂未a央。脩相思、願毌相a忘。
#kyomeishusei2015_00004: 常樂未a央。脩相思、願毌相a忘。
def readin_mirrors_naively_annotated_data():
    """Read the naively annotated mirrors file into a poem dictionary."""
    source_file = get_mirors_naively_annotated_file()
    return readin_mirrors_data(source_file)

def readin_mirrors_schuessler_annotated_data():
    """Read the schuessler annotated mirrors file into a poem dictionary."""
    source_file = get_mirors_schuessler_annotated_file()
    return readin_mirrors_data(source_file)

def readin_mirrors_com_det_annotated_data():
    """Read the com_det annotated mirrors file into a poem dictionary."""
    source_file = get_mirrors_com_det_annotated_file()
    return readin_mirrors_data(source_file)

#
# TO DO:
# - ensure that the mirrors data is consistent for each type: com_det, naive, schuessler
def readin_mirrors_data(filename):
    """Read an annotated mirrors file into {poem number: [raw lines]}.

    Each non-blank line is split on '.'; the second piece is then split on
    ':' into the poem number and the poem text.  Lines that split into more
    than four '.'-separated pieces are skipped.  A poem number gets an entry
    as soon as one of its lines is seen, but only lines whose poem text is
    non-blank are stored (the whole raw line is stored, not just the text).

    Raises:
        IndexError: a non-blank line has no '.' or no ':' where expected.
        ValueError: the poem number field is not an integer.
    """
    retval = {}
    for rd in readlines_of_utf8_file(filename):
        # Blank lines must be skipped before any indexing is attempted,
        # otherwise they raise IndexError instead of being ignored.
        if not rd.split():
            continue
        dot_parts = rd.split('.')
        if len(dot_parts) > 4:
            continue
        num_and_text = dot_parts[1].split(':')
        poem_only = num_and_text[1]
        poem_num = int(num_and_text[0])
        if poem_num not in retval:
            retval[poem_num] = []
        if poem_only.strip():
            retval[poem_num].append(rd)
    return retval

def readin_stelae_naively_annotated_data():
    """Read the naively annotated stelae file into a poem dictionary."""
    source_file = get_stelae_naively_annotated_file()
    return readin_stelae_data(source_file)

def readin_stelae_schuessler_annotated_data():
    """Read the schuessler annotated stelae file into a poem dictionary."""
    source_file = get_stelae_schuessler_annotated_file()
    return readin_stelae_data(source_file)

def readin_stelae_com_det_annotated_data():
    """Read the com_det annotated stelae file into a poem dictionary."""
    source_file = get_stelae_com_det_annotated_file()
    return readin_stelae_data(source_file)

#
#
# TO DO:
#   Unify the output of the annotators; they aren't the same and it's trashing everything
#
def readin_stelae_data(filename):
    """Read an annotated stelae file into {poem number: [raw lines]}.

    The poem number is the second '.'-separated field of a line; lines with
    no '.' are skipped.  The poem text is whatever follows the '： ' delimiter
    (or the ASCII ': ' delimiter when the full-width one is absent).  A poem
    number gets an entry as soon as one of its lines is seen, but only lines
    with non-blank poem text are stored (the whole raw line is stored).

    Raises:
        ValueError: the second '.'-separated field is not an integer.
    """
    retval = {}
    for rd in readlines_of_utf8_file(filename):
        dot_parts = rd.split('.')
        if len(dot_parts) < 2:
            continue
        poem_num = int(dot_parts[1])
        # Not every annotator writes the same delimiter, so try both.  A line
        # with neither has no identifiable poem text and is treated as empty
        # rather than crashing on the later .strip().
        poem_only = ''
        for delimiter in ('： ', ': '):
            fields = rd.split(delimiter)
            if len(fields) > 1:
                poem_only = fields[1]
                break
        if poem_num not in retval:
            retval[poem_num] = []
        if poem_only.strip():
            retval[poem_num].append(rd)
    return retval

#
# TO DO:
# 1) fix the 'prev' 'next' buttons: Lu1983 data goes from 1 - 541.
#    PROBLEM: When you hit 'prev' at 1, it goes back to 539

def readin_received_shi_data(filename):
    """Read an annotated received-shi file into {poem number: [raw lines]}.

    The poem number is the integer in the second '.'-separated field of each
    line.  Example lines:
        Lu1983.001.1.1： 大風起兮雲飛揚。
        Lu1983.001.1.2： 威加海內兮歸故a鄉。
        Lu1983.001.1.3： 安得猛士兮守四方。
        Lu1983.002.1.1： 鴻鵠高飛。
        Lu1983.002.1.2： 一舉千a里。
        Lu1983.002.1.3： 羽翼就。
        Lu1983.002.1.4： 橫絕四a海。
    """
    poems = {}
    for raw_line in readlines_of_utf8_file(filename):
        number = int(raw_line.split('.')[1])
        poems.setdefault(number, []).append(raw_line)
    return poems

#def readin_received_shi_schuessler_annotated_data():
#    return readlines_of_utf8_file(get_received_shi_schuessler_annotated_file())
# data_type
#    the type of physical medium the poetry comes from
#    received_shi
#    mirrors
#    stelae
# annotation_type
#    the type of analysis done on the data
#    naive
#    com_det
#    schuessler
class poem_array:
    """The poems of one data set as marked up by one annotator, plus a cursor.

    Attributes:
        annotation_type: 'naive', 'com_det' or 'schuessler'.
        data_type: string containing 'received_shi', 'mirrors' or 'stelae'
            (matched by substring).
        poem_array: dictionary mapping poem number -> list of poem lines.
        indicies: the keys of poem_array, in dictionary order.
        current_poem: the poem number the cursor is on.
    """
    def __init__(self):
        self.zero_out_data()

    def zero_out_data(self):
        """Reset every attribute to its empty state."""
        self.annotation_type = ''
        self.data_type = ''
        self.poem_array = {}
        self.indicies = []
        self.current_poem = 1

    def is_data_empty(self):
        """Return True when no poems are loaded."""
        return not self.poem_array

    def set_data(self, annotation_type, data_type):
        """Load the poems for the given annotator and data set.

        When the annotation type or data type is not recognised no poems are
        loaded and the cursor keeps its reset value.
        """
        self.zero_out_data()
        self.annotation_type = annotation_type
        self.data_type = data_type
        if annotation_type == 'naive':
            if 'received_shi' in data_type:
                self.poem_array = readin_received_shi_naively_annotated_data()
            elif 'mirrors' in data_type:
                self.poem_array = readin_mirrors_naively_annotated_data()
            elif 'stelae' in data_type:
                self.poem_array = readin_stelae_naively_annotated_data()
        elif annotation_type == 'schuessler':
            if 'received_shi' in data_type:
                self.poem_array = readin_received_shi_schuessler_annotated_data()
            elif 'mirrors' in data_type:
                self.poem_array = readin_mirrors_schuessler_annotated_data()
            elif 'stelae' in data_type:
                self.poem_array = readin_stelae_schuessler_annotated_data()
        elif annotation_type == 'com_det':
            if 'received_shi' in data_type:
                self.poem_array = readin_received_shi_com_det_annotated_data()
            elif 'mirrors' in data_type:
                self.poem_array = readin_mirrors_com_det_annotated_data()
            elif 'stelae' in data_type:
                self.poem_array = readin_stelae_com_det_annotated_data()

        self.indicies = list(self.poem_array.keys())
        # Nothing loaded means there is no first poem to point at.
        if self.indicies:
            self.current_poem = self.indicies[0]

    def _current_position(self):
        """Return the cursor's position in indicies, or None if it is not there."""
        try:
            return self.indicies.index(self.current_poem)
        except ValueError:
            return None

    def increment_to_next_poem(self):
        """Move the cursor to the next poem, wrapping from the last to the first."""
        position = self._current_position()
        if position is None:
            return
        self.current_poem = self.indicies[(position + 1) % len(self.indicies)]

    def get_next_poem(self):
        """Advance the cursor and return the poem it lands on."""
        self.increment_to_next_poem()
        return self.get_current_poem()

    def decrement_to_prev_poem(self):
        """Move the cursor to the previous poem, wrapping from the first to the last."""
        position = self._current_position()
        if position is None:
            return
        # Position 0 becomes index -1, i.e. the last poem.
        self.current_poem = self.indicies[position - 1]

    def get_prev_poem(self):
        """Step the cursor back and return the poem it lands on."""
        self.decrement_to_prev_poem()
        return self.get_current_poem()

    def get_current_poem(self):
        """Return the poem under the cursor, or '' when there is none."""
        retval = ''
        if self.current_poem in self.indicies:
            retval = self.poem_array[self.current_poem]
        return retval

    def is_poem_number_in_array(self, poem_num):
        """Return True when poem_num is one of the loaded poem numbers."""
        return poem_num in self.indicies

    def get_poem_num(self, poem_num):
        """Return the poem with number poem_num, or '' when it is not loaded."""
        retval = ''
        if self.is_poem_number_in_array(poem_num):
            # poem_array is keyed by poem number, not by position in indicies.
            retval = self.poem_array[poem_num]
        return retval

class poem_data:
    """One data set as marked up by all three annotators, stepped through together.

    Attributes:
        data_type: string naming the data set; passed on to poem_array.set_data.
        naive, com_det, schuessler: one poem_array per annotator.
    """
    def __init__(self, data_type):
        self.data_type = data_type
        self.reset()

    def reset(self):
        """Replace the three poem arrays with fresh, empty ones."""
        self.do_print_msg = True
        self.class_name = 'poem_data'
        self.schuessler = poem_array()
        self.naive = poem_array()
        self.com_det = poem_array()

    def new_data_type(self, data_type):
        """Switch to another data set and load it for all three annotators."""
        self.reset()
        self.data_type = data_type
        self.naive.set_data('naive', data_type)
        self.com_det.set_data('com_det', data_type)
        self.schuessler.set_data('schuessler', data_type)

    def get_total_num_poems(self):
        """Return the number of poems in the first annotator array that has any.

        The arrays are consulted in the order naive, schuessler, com_det;
        0 is returned when none of them holds data.
        """
        for annotated in (self.naive, self.schuessler, self.com_det):
            if annotated and not annotated.is_data_empty():
                return len(annotated.poem_array)
        return 0

    def increment_to_next_poem(self):
        """Advance the cursor of every annotator array."""
        if self.naive:
            self.naive.increment_to_next_poem()
        if self.com_det:
            self.com_det.increment_to_next_poem()
        if self.schuessler:
            self.schuessler.increment_to_next_poem()

    def decrement_to_prev_poem(self):
        """Step back the cursor of every annotator array."""
        if self.naive:
            self.naive.decrement_to_prev_poem()
        if self.com_det:
            self.com_det.decrement_to_prev_poem()
        if self.schuessler:
            self.schuessler.decrement_to_prev_poem()

    def get_current_naive_poem(self):
        """Return the current naive poem, loading the naive data on first use."""
        if self.naive.is_data_empty():
            self.naive.set_data('naive', self.data_type)
        return self.naive.get_current_poem()

    def get_current_com_det_poem(self):
        """Return the current com_det poem, loading the com_det data on first use."""
        if self.com_det.is_data_empty():
            self.com_det.set_data('com_det', self.data_type)
        return self.com_det.get_current_poem()

    def get_current_schuessler_poem(self):
        """Return the current schuessler poem, loading the schuessler data on first use."""
        if self.schuessler.is_data_empty():
            self.schuessler.set_data('schuessler', self.data_type)
        return self.schuessler.get_current_poem()

class state_memory:
    """Keeps one state string in a text file so it can be read back later."""
    def __init__(self, filename):
        self.file = filename

    def remember_state(self, state):
        """Replace the contents of the state file with state."""
        delete_file_if_it_exists(self.file)
        append_line_to_output_file(self.file, state)

    def recall_state(self):
        """Return the first line of the state file, or '' when nothing is stored.

        Callers test the result for truthiness to detect "no saved state", so
        a missing or empty file yields '' instead of raising.
        """
        if not os.path.isfile(self.file):
            return ''
        lines = readlines_of_utf8_file(self.file)
        return lines[0] if lines else ''


class Ui(QtWidgets.QMainWindow):
    """Main window showing the current poem as marked up by each annotator.

    The widgets (radio buttons, prev/next buttons, text edits) come from the
    .ui file loaded in __init__.
    """
    def __init__(self):
        super(Ui, self).__init__()
        # These are read by the error path just below, so assign them first.
        self.class_name = 'Ui'
        self.print_debug_msgs = True
        self.do_print_debug_msg = True
        ui_file = os.path.join(get_hanproj_dir(), 'annotator_comparison', 'annotator_comparison2.ui')
        if not file_does_exist(ui_file):
            debug_msg('ERROR: bad filename - ' + ui_file, 'Ui::__init__()', self.print_debug_msgs)
            return
        uic.loadUi(ui_file, self)
        self.data_type_gb_memory = state_memory(self.get_data_type_gb_memory_filename())
        radio_button_name = self.data_type_gb_memory.recall_state()
        if not radio_button_name:
            radio_button_name = 'received_shi_rb'
        # The radio button is checked before the toggled signals are
        # connected, so on_selected() does not fire during start-up.
        self.set_data_type_gb_rb_as_checked(radio_button_name)

        self.initialize_data_type_gb_connect_functions()
        self.p_data = poem_data(self.data_type_gb_memory.recall_state())
        self.initialize_buttons()
        self.load_current_poem()

        self.show()

    def get_data_type_gb_memory_filename(self):
        """Return the path of the file that stores the selected radio button name."""
        return os.path.join(get_hanproj_dir(), 'annotator_comparison', 'data_type_gb_memory.txt')

    def set_data_type_gb_rb_as_checked(self, rb_name):
        """Check the radio button called rb_name and store rb_name as the saved state.

        An unknown name produces a debug message; the name is stored regardless.
        """
        funct_name = 'set_data_type_gb_rb_as_checked()'
        if rb_name == 'received_shi_rb':
            self.received_shi_rb.setChecked(True)
        elif rb_name == 'stelae_rb':
            self.stelae_rb.setChecked(True)
        elif rb_name == 'mirrors_rb':
            self.mirrors_rb.setChecked(True)
        else:
            msg = ' 尷尬 ERROR: rb_name should not be ' + rb_name + '!'
            origin = self.class_name + '::' + funct_name
            debug_msg(msg, origin, self.do_print_debug_msg)
        self.data_type_gb_memory.remember_state(rb_name)

    def next_button_push(self):
        """Show the next poem."""
        self.p_data.increment_to_next_poem()
        self.load_current_poem()

    def prev_button_push(self):
        """Show the previous poem."""
        self.p_data.decrement_to_prev_poem()
        self.load_current_poem()

    def initialize_buttons(self):
        """Connect the prev/next buttons to their handlers."""
        self.prev_button.clicked.connect(self.prev_button_push)
        self.next_button.clicked.connect(self.next_button_push)

    def initialize_data_type_gb_connect_functions(self):
        """Connect every data-type radio button to on_selected()."""
        self.received_shi_rb.toggled.connect(self.on_selected)
        self.stelae_rb.toggled.connect(self.on_selected)
        self.mirrors_rb.toggled.connect(self.on_selected)

    def get_current_data_type(self):
        """Return the stored radio button name."""
        return self.data_type_gb_memory.recall_state()

    def on_selected(self):
        """Switch data set when a radio button becomes checked."""
        radio_button = self.sender()
        if radio_button.isChecked():
            self.print_debug_msg("You have selected : " + radio_button.text())
            name = radio_button.objectName()
            self.data_type_gb_memory.remember_state(name)
            self.p_data.new_data_type(name)
            self.load_current_poem()

    def print_debug_msg(self, msg):
        """Show msg in the textEdit widget."""
        self.textEdit.setText(msg)

    def get_rhyme_groups_from_poem(self, poem, line_delim='\n'):
        """Return the rhyme groups of an annotated poem (marker -> rhyme words)."""
        return get_rhyme_groups_from_annotated_poem(poem, line_delim)

    def load_current_poem(self):
        """Display the current poem for each annotator and update the similarity boxes."""
        getters = (('naive', self.p_data.get_current_naive_poem),
                   ('com_det', self.p_data.get_current_com_det_poem),
                   ('schuessler', self.p_data.get_current_schuessler_poem))
        marker2rw = {}
        for annotator_type, get_poem in getters:
            poem_text = '\n'.join(get_poem())
            self.add_poem_to_textEdit(poem_text, annotator_type)
            marker2rw[annotator_type] = self.get_rhyme_groups_from_poem(poem_text)

        # The size of the naive poem's 'a' group is the reference count; a
        # poem without that group counts as zero rhymes.
        num_naive_rhymes = len(marker2rw['naive'].get('a', []))
        n2cd_similarity = self.get_percentage_similarity_to_naive_annotator(marker2rw['com_det'], num_naive_rhymes)
        n2s_similarity = self.get_percentage_similarity_to_naive_annotator(marker2rw['schuessler'], num_naive_rhymes)
        self.update_n2s_tb(n2s_similarity)
        self.update_n2c_tb(n2cd_similarity)

    def update_n2s_tb(self, content):
        """Clear the n2s box and, when content is non-blank, show content in it."""
        self.n2s_tb.clear()
        if content.strip():
            self.n2s_tb.setText(content)

    def update_n2c_tb(self, content):
        """Clear the n2c box and, when content is non-blank, show content in it."""
        self.n2c_tb.clear()
        if content.strip():
            self.n2c_tb.setText(content)

    def update_s2c_tb(self, content):
        """Clear the s2c box and, when content is non-blank, show content in it."""
        self.s2c_tb.clear()
        if content.strip():
            self.s2c_tb.setText(content)

    def get_percentage_similarity_to_naive_annotator(self, marker2rw_list, num_naive_rhymes):
        """Return, as a string, the largest rhyme group's size as a % of num_naive_rhymes.

        marker2rw_list is a dictionary where key = rhyme group marker and
        value = rhyme word list for that marker.  '0' is returned when there
        are no rhyme groups or num_naive_rhymes is zero, since no ratio can
        be formed.
        """
        if not marker2rw_list or not num_naive_rhymes:
            return '0'
        max_num_rws = max(len(marker2rw_list[m]) for m in marker2rw_list)
        return str(int(round(100.0*float(max_num_rws)/float(num_naive_rhymes))))

    def add_poem_to_textEdit(self, poem, annotator_type):
        """Write poem into the annotator's text edit with colour-coded text.

        Each line is split into an id part and a content part at '： ' (or at
        ': ' when the full-width delimiter is absent).  The id part is written
        in black; the content is written in the colours reported by
        get_colors_for_poem_line().  Blank lines are skipped and a line with
        no delimiter is written in black as it stands.

        Raises:
            ValueError: annotator_type is not 'naive', 'com_det' or 'schuessler'.
        """
        red_color = QColor(255, 0, 0)
        black_color = QColor(0, 0, 0)
        if annotator_type == 'naive':
            annotator = self.naive_annotator_te
        elif annotator_type == 'com_det':
            annotator = self.com_det_annotator_te
        elif annotator_type == 'schuessler':
            annotator = self.schuessler_annotator_te
        else:
            raise ValueError('unknown annotator type: ' + str(annotator_type))
        annotator.setText('')
        for line in poem.split('\n'):  # line = 'Lu1983.016.1.2： 驂駕駟馬從梁α來'
            if not line.strip():
                continue
            if '： ' in line:
                fields = line.split('： ')
            elif ': ' in line:
                fields = line.split(': ')
            else:
                fields = [line]
            annotator.setTextColor(black_color)
            annotator.insertPlainText(fields[0])
            content = fields[1] if len(fields) > 1 else ''
            for text, color_name in get_colors_for_poem_line(content):
                annotator.setTextColor(red_color if color_name == 'red' else black_color)
                annotator.insertPlainText(text)
            annotator.insertPlainText('\n')

def trash_test_function():
    """Print the colour segments computed for one sample poem line."""
    sample = '總領從官柏梁a台' #'總a領從官柏梁κc台'
    segments = get_colors_for_poem_line(sample)
    print('INPUT: ' + sample)
    print('OUTPUT: ')
    for text, color_name in segments:
        print(color_name + ': ' + text)

def get_colors_for_poem_line(line, is_verbose=False):
    """Split line into consecutive runs of same-coloured characters.

    A character is 'black' when is_hanzi() accepts it or it is one of the
    punctuation marks '。', '、', '，'; every other character is 'red'.
    Returns a list of (text, colour) tuples in line order.  With is_verbose
    the input and the resulting runs are also printed.
    """
    black_marks = ['。', '、', '，']
    segments = []
    run_chars = []
    run_color = ''
    for ch in line:
        ch_color = 'black' if (is_hanzi(ch) or ch in black_marks) else 'red'
        # A colour change closes the run collected so far.
        if run_chars and ch_color != run_color:
            segments.append((''.join(run_chars), run_color))
            run_chars = []
        run_color = ch_color
        run_chars.append(ch)
    if run_chars:
        segments.append((''.join(run_chars), run_color))

    if is_verbose:
        print('INPUT: ' + line)
        print('OUTPUT: ')
        for text, color_name in segments:
            print(color_name + ': ' + text)

    return segments

#
# NOTE:
# this needs to be changed, but the code that writes out the files needs to be changed first.
# Write a function that takes annotator_type and data_type as input, and returns the filename:
#  {annotator_type}_annotated_{data_type}_poem_data.txt
# Annotated-poem file names, indexed first by data type, then by annotator type.
# The commented-out entries are alternative com_det file names kept for reference.
an_poem2filename = {
    'received_shi': {
        'naive': 'naively_annotated_Lu_1983_先秦漢魏晉南北朝詩.txt',
        'schuessler': 'schuessler_annotated_Lu_1983_先秦漢魏晉南北朝詩.txt',
        'com_det': 'com_det_annotated_received-shi_data.txt',
        #'com_det':'com_det_annotated_Lu_1983_先秦漢魏晉南北朝詩.txt'
    },
    'mirrors': {
        'naive': 'naively_annotated_kyomeishusei2015_han_mirror_data.txt',
        'schuessler': 'schuessler_annotated_kyomeishusei2015_han_mirror_data.txt',
        'com_det': 'com_det_annotated_mirrors_data.txt',
        #'com_det':'com_det_annotated_kyomeishusei2015_han_mirror_data.txt'
    },
    'stelae': {
        'naive': 'naively_annotated_毛遠明 《漢魏六朝碑刻校注》.txt',
        'schuessler': 'schuessler_annotated_毛遠明 《漢魏六朝碑刻校注》.txt',
        'com_det': 'com_det_annotated_stelae_data.txt',
        #'com_det':'com_det_annotated_毛遠明 《漢魏六朝碑刻校注》.txt'
    },
}
def get_annotated_poem_data_filename(annotator_type, data_type):
    """Return the path of the annotated-poem file for an annotator and data set.

    The file name is looked up in an_poem2filename[data_type][annotator_type]
    and joined onto the hanproj directory and the data set's sub-directory
    (data_type itself, except that 'received_shi' lives in 'received-shi').
    Returns '' when the data type or annotator type has no entry.
    """
    try:
        filename = an_poem2filename[data_type][annotator_type]
    except (KeyError, TypeError):
        # Unknown (or unhashable) data type / annotator type: no such file.
        return ''
    data_dir = 'received-shi' if data_type == 'received_shi' else data_type
    return os.path.join(get_hanproj_dir(), data_dir, filename)
# naively_annotated_Lu_1983_先秦漢魏晉南北朝詩.txt
# schuessler_annotated_Lu_1983_先秦漢魏晉南北朝詩.txt
# com_det_annotated_Lu_1983_先秦漢魏晉南北朝詩.txt
def readin_annotated_poem_data(annotator_type, data_type):
    """Read an annotated-poem file into {line id: poem line}.

    Each line is split at '： ' when it contains a full-width colon, otherwise
    at ': ' when it contains an ASCII colon; the first piece is the id and the
    second piece the poem line.  Lines with no colon, or where the split
    produces fewer than two pieces, are skipped.  Returns {} when the file
    does not exist.
    """
    funct_name = 'readin_annotated_poem_data()'
    retval = {}
    filename = get_annotated_poem_data_filename(annotator_type, data_type)
    if not if_file_exists(filename, funct_name):
        return retval
    for line in readlines_of_utf8_file(filename):
        if '：' in line:
            fields = line.split('： ')
        elif ':' in line:
            fields = line.split(': ')
        else:
            # Without a delimiter there is no id/content pair to store.
            continue
        if len(fields) < 2:
            continue
        retval[fields[0]] = fields[1]
    return retval

def test_readin_annotated_poem_data():
    """Print every id/line pair read for the com_det received_shi data, then the count."""
    # Other combinations tried here: ('naive', 'received_shi'), ('schuessler', 'received_shi')
    annotated = readin_annotated_poem_data('com_det', 'received_shi')
    for line_id, poem_line in annotated.items():
        print(line_id + '= ' + poem_line)
    print(str(len(annotated)) + ' elements.')

def test_compare_data_sets():
    """Run the annotator comparison on the 'stelae' data set."""
    # 'received_shi' is another data type this comparison has been run on.
    compare_annotation_between_different_annotators('stelae')

def get_poem_base_id2poem_content_dict(poem_id2poem_line_d):
    """Group per-line entries into whole poems keyed by their base id.

    The base id is the first two dot-separated fields of a line id
    (e.g. 'Lu1983.047.3' -> 'Lu1983.047'). Consecutive entries sharing a
    base id are joined with '\\n' in iteration order. If a base id shows up
    again after a different one, the later run replaces the earlier one.

    Args:
        poem_id2poem_line_d: dict mapping line id -> line content.

    Returns:
        dict mapping base id -> poem content. An empty input yields an empty
        dict (no placeholder '' entry).

    Raises:
        IndexError: if a line id has fewer than two dot-separated fields.
    """
    retval = {}
    current_base_id = None
    current_lines = []
    for poem_id, poem_line in poem_id2poem_line_d.items():
        id_fields = poem_id.split('.')
        base_id = id_fields[0] + '.' + id_fields[1]
        if base_id != current_base_id:
            # Entering a new poem: flush the one collected so far.
            if current_base_id is not None:
                retval[current_base_id] = '\n'.join(current_lines)
            current_base_id = base_id
            current_lines = []
        current_lines.append(poem_line)
    # Flush the last poem, but only if there was any input at all.
    if current_base_id is not None:
        retval[current_base_id] = '\n'.join(current_lines)
    return retval

def renumber_poems(data, data_type):
    """Collapse ids to their first two dot-separated fields for some data types.

    For 'mirrors' and 'stelae', entries whose ids share the same first two
    fields are concatenated (no separator) under that shortened id. For any
    other data_type the input dict is returned untouched.
    """
    if data_type not in ('mirrors', 'stelae'):
        return data
    merged = {}
    for full_id, content in data.items():
        id_fields = full_id.split('.')
        short_id = id_fields[0] + '.' + id_fields[1]
        merged[short_id] = merged.get(short_id, '') + content
    return merged

def compare_annotation_between_different_annotators(data_type):
    """Compare the schuessler and com_det annotations with the naive one.

    For every poem in the schuessler data, the rhyme groups of all three
    annotators are extracted with get_rhyme_groups_from_annotated_poem().
    Each poem then falls into one of three buckets:

    * no_rhymes: the naive annotation yields no rhyme groups;
    * irregular_poems: the naive annotation has no 'a' group;
    * accounted for: a result line of the form
      ``poem_id <TAB> N=S|N≠S (pct) <TAB> N=C|N≠C (pct) <TAB> S=C|S≠C <TAB> #rhymes``
      is recorded, where #rhymes is the size of the naive 'a' group.

    Result lines are printed and appended to
    ``annotator_comparison_for_<data_type>.txt`` grouped by #rhymes in
    descending order, followed by printed summary statistics.

    NOTE(review): the similarity values returned by
    get_percentage_similarity_to_naive_annotator() are treated as strings
    ('100' means identical) -- confirm against that helper.

    Raises:
        KeyError: if a poem id of the schuessler data is missing from the
            naive or com_det data.
    """
    n_data = readin_annotated_poem_data('naive', data_type)
    s_data = readin_annotated_poem_data('schuessler', data_type)
    cd_data = readin_annotated_poem_data('com_det', data_type)
    if data_type == 'received_shi':
        n_data = get_poem_base_id2poem_content_dict(n_data)  # id2poem_d
        s_data = get_poem_base_id2poem_content_dict(s_data)  # id2poem_d
        cd_data = get_poem_base_id2poem_content_dict(cd_data)  # id2poem_d
        data_type4file = 'received-shi'
    else:
        data_type4file = data_type

    output_file = os.path.join(get_hanproj_dir(), data_type4file, 'annotator_comparison_for_' + data_type + '.txt')
    delete_file_if_it_exists(output_file)

    delim = '\t'
    poem_line_delim = '。'
    num_rhymes2data_d = {}       # number of naive rhymes -> result lines
    result2num_instances_d = {}  # result signature -> list of naive rhyme counts
    result2poem_id = {}          # result signature -> poem ids
    no_rhymes = []
    irregular_poems = []
    poems_accounted_for = []     # debug bookkeeping for the audit below

    s_data = renumber_poems(s_data, data_type)
    n_data = renumber_poems(n_data, data_type)
    cd_data = renumber_poems(cd_data, data_type)

    for poem_id in s_data:
        # Per poem: dict[marker] = [rhyme words]
        s_m2rw_words_d = get_rhyme_groups_from_annotated_poem(s_data[poem_id], poem_line_delim)
        n_m2rw_words_d = get_rhyme_groups_from_annotated_poem(n_data[poem_id], poem_line_delim)
        cd_m2rw_words_d = get_rhyme_groups_from_annotated_poem(cd_data[poem_id], poem_line_delim)
        if not n_m2rw_words_d:
            no_rhymes.append(poem_id)
            continue
        try:
            num_naive_rhymes = len(n_m2rw_words_d['a'])
        except KeyError:
            irregular_poems.append(poem_id)
            continue

        s2n_percent = get_percentage_similarity_to_naive_annotator(s_m2rw_words_d, num_naive_rhymes)  # per poem
        cd2n_percent = get_percentage_similarity_to_naive_annotator(cd_m2rw_words_d, num_naive_rhymes)

        n_equal_s = s2n_percent == '100'
        n_equal_cd = cd2n_percent == '100'
        if s2n_percent != cd2n_percent:
            s_equal_cd = False
        elif n_equal_s:
            # Both are identical to the naive annotation, hence to each other.
            s_equal_cd = True
        else:
            # Same similarity score does not imply the same groups; compare them.
            s_equal_cd = are_these_two_marker2rhyme_word_dictionaries_equal(s_m2rw_words_d, cd_m2rw_words_d)

        n_s_label = 'N=S' if n_equal_s else 'N≠S'
        n_cd_label = 'N=C' if n_equal_cd else 'N≠C'
        s_cd_label = 'S=C' if s_equal_cd else 'S≠C'

        # Signature used to bucket poems by comparison outcome.
        r2n_index = n_s_label + '; ' + n_cd_label + '; ' + s_cd_label + '; '

        line_out = poem_id + delim
        line_out += (n_s_label if n_equal_s else n_s_label + ' (' + s2n_percent + ')') + delim
        line_out += (n_cd_label if n_equal_cd else n_cd_label + ' (' + cd2n_percent + ')') + delim
        line_out += s_cd_label + delim
        line_out += str(num_naive_rhymes)

        result2num_instances_d.setdefault(r2n_index, []).append(num_naive_rhymes)
        result2poem_id.setdefault(r2n_index, []).append(poem_id)
        poems_accounted_for.append(poem_id)  # debug
        num_rhymes2data_d.setdefault(num_naive_rhymes, []).append(line_out)

    # Report from the largest rhyme count downwards. Counts of 0 and 1 are
    # left out of the report, exactly as before (the old loop stopped at 2).
    # Iterating the recorded keys also works when no poem was recorded, where
    # the previous code failed on an unbound max_k.
    for k in sorted(num_rhymes2data_d, reverse=True):
        if k <= 1:
            continue
        print('#Rhymes = ' + str(k))
        append_line_to_utf8_file(output_file, '#Rhymes = ' + str(k))
        for line in num_rhymes2data_d[k]:
            print('\t' + line)
            append_line_to_utf8_file(output_file, '\t' + line)
    print('*-' * 20)
    print('\t\tFor ' + data_type.capitalize() + ' data.')
    print('*-' * 20)
    for r2n_index in result2num_instances_d:
        msg = r2n_index + ' has ' + str(len(result2num_instances_d[r2n_index])) + ' instances, for a total of '
        msg += str(sum(result2num_instances_d[r2n_index])) + ' rhymes.'
        print(msg)
    print('*-' * 20)
    for r2n_index in result2poem_id:
        msg = r2n_index + ' has ' + str(len(result2poem_id[r2n_index])) + ' poem_ids.'
        print(msg)
    print(str(len(no_rhymes)) + ' poems have no rhymes.')
    print('\t' + ', '.join(no_rhymes))
    print(str(len(irregular_poems)) + ' poems had irregularities and were not counted.')
    print('\t' + ', '.join(irregular_poems))

    # Debug audit: which 'kyomeishusei2015.NNNNN' ids (1..11817) ended up in
    # none of the three buckets. NOTE(review): the id prefix and range are
    # hard-coded, so this list is presumably only meaningful for the mirror
    # data -- confirm.
    classified = set(no_rhymes) | set(poems_accounted_for) | set(irregular_poems)
    not_sure = []
    for inc in range(1, 11817 + 1):
        poem_id = 'kyomeishusei2015.' + str(inc).zfill(5)
        if poem_id not in classified:
            not_sure.append(poem_id)
    print(str(len(irregular_poems)) + ' in irregular_poems.')
    print(str(len(no_rhymes)) + ' in no_rhymes.')
    print(str(len(poems_accounted_for)) + ' in poems_accounted_for.')
    print('Poems we aren\'t sure about: ' + str(len(not_sure)))
    print('\t' + ', '.join(not_sure))


def are_these_two_marker2rhyme_word_dictionaries_equal(m2rw_a, m2rw_b):
    """Tell whether two marker -> rhyme-word dictionaries hold the same groups.

    Both dictionaries are first regrouped with
    convert_marker2rw_word_to_len2rw_word_dict(); the result is indexed here
    by a group size and yields a list of rhyme-word lists. The dictionaries
    count as equal when, for every size, both sides contain the same groups,
    ignoring the order of groups and the order of words inside a group.
    NOTE(review): because the comparison runs on the regrouped data, marker
    letters themselves do not appear to take part in it -- confirm against
    the converter.

    The inputs are not modified; comparison is done on sorted copies.

    Returns:
        True when the groups match, False otherwise (including when the two
        dictionaries have a different number of markers).
    """
    # A different number of markers can never match.
    if len(m2rw_a) != len(m2rw_b):
        return False
    len2rw_a = convert_marker2rw_word_to_len2rw_word_dict(m2rw_a)
    len2rw_b = convert_marker2rw_word_to_len2rw_word_dict(m2rw_b)
    for num_rw in len2rw_a:
        try:
            groups_a = len2rw_a[num_rw]  # list of rhyme-word lists
            groups_b = len2rw_b[num_rw]
        except (IndexError, KeyError):
            return False
        if len(groups_a) != len(groups_b):
            return False
        # Normalise BOTH sides: sorting only one side made equal groups look
        # different, and a plain membership test let duplicates in `a` match
        # a single group in `b`.
        normalized_a = sorted(sorted(group) for group in groups_a)
        normalized_b = sorted(sorted(group) for group in groups_b)
        if normalized_a != normalized_b:
            return False
    return True

def test_print_out_poems_given_ids():
    """Print the received_shi poem with id 'Lu1983.047'."""
    # Another id noted next to this call: Lu1983.539
    poem_ids = ['Lu1983.047']
    print_out_poems_given_ids('received_shi', poem_ids)


# Disabled block: the `if 0:` guard is never true, so the two helpers below
# are NOT defined by this part of the file.
if 0:
    #
    # INPUT: poem as single string (lines end in '\n')
    def is_poem_annotated(poem):
        # Match object for the first Latin or Greek letter found, else None.
        return re.search(r'[a-zA-Zα-ωΑ-Ω]', poem)

    def does_line_have_rhyme_marker(line):
        # Same check as is_poem_annotated(), applied to a single line.
        return is_poem_annotated(line)

def is_poem_in_one_rhyme_per_line_format(poem):
    """Check whether the first line of a poem carries a rhyme marker.

    INPUT: poem as a single string (lines end in '\\n').

    ASSUMPTION: if the first line has no rhyme marker, the poem is not in
    one-rhyme-per-line format. This is wrong when the first line should
    rhyme but is unmarked because of an anomaly (such as a square, i.e. an
    unrecognizable character).

    Returns the regex match for the first Latin or Greek letter on the first
    line (truthy), or None when there is none.
    """
    first_line, _, _ = poem.partition('\n')
    return re.search(r'[a-zA-Zα-ωΑ-Ω]', first_line)
def format_poem_as_one_rhyme_per_line(poem):
    """Merge each pair of consecutive lines so every output line ends a rhyme.

    INPUT: poem as a single string (lines end in '\\n').

    The poem is returned unchanged when is_poem_annotated() reports no
    annotation, or when is_poem_in_one_rhyme_per_line_format() reports that
    it already has the desired layout. Otherwise lines 1+2, 3+4, ... are
    concatenated and the merged lines are joined with '\\n'.

    If the poem has an odd number of lines, a non-empty last line is kept as
    a line of its own instead of being dropped; an empty last line (from a
    trailing newline) is still discarded.
    """
    if not is_poem_annotated(poem):
        return poem  # no annotated rhymes, so there is no formatting to do
    if is_poem_in_one_rhyme_per_line_format(poem):
        return poem  # already in the desired format
    lines = poem.split('\n')
    merged = [lines[i] + lines[i + 1] for i in range(0, len(lines) - 1, 2)]
    if len(lines) % 2 and lines[-1]:
        # Unpaired final line: keep its text rather than losing it.
        merged.append(lines[-1])
    return '\n'.join(merged)

#
# INPUT: poem as single string (lines end in '\n')
# NOTE:
#   doesn't work on Schuessler Anntation. Would need to add