''' --------------------------------------------------------
Written by Yufei Ye (https://github.com/JudyYe)
Edited by Anirudh Chakravarthy (https://github.com/anirudh-chakravarthy)
-------------------------------------------------------- '''
import os
import xml.etree.ElementTree as ET

import numpy as np
from PIL import Image
import scipy.io
import torch
from torch.utils.data import Dataset
import torchvision.transforms as transforms


class VOCDataset(Dataset):
CLASS_NAMES = [
'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
]
INV_CLASS = {}
for i in range(len(CLASS_NAMES)):
INV_CLASS[CLASS_NAMES[i]] = i

    # TODO: Ensure data directory is correct
def __init__(
self,
split='trainval',
image_size=224,
top_n=300,
data_dir='data/VOCdevkit/VOC2007/'
):
super().__init__()
self.split = split # 'trainval' or 'test'
self.data_dir = data_dir
self.size = image_size
self.top_n = top_n # top_n: number of proposals to return
self.img_dir = os.path.join(data_dir, 'JPEGImages')
self.ann_dir = os.path.join(data_dir, 'Annotations')
self.selective_search_dir = os.path.join(
data_dir, 'selective_search_data')
        self.roi_data = scipy.io.loadmat(
            os.path.join(self.selective_search_dir, 'voc_2007_' + split + '.mat'))
split_file = os.path.join(data_dir, 'ImageSets/Main', split + '.txt')
with open(split_file) as fp:
self.index_list = [line.strip() for line in fp]
self.anno_list = self.preload_anno()

    @classmethod
def get_class_name(cls, index):
"""
:return: category name for the corresponding class index.
"""
return cls.CLASS_NAMES[index]

    @classmethod
def get_class_index(cls, name):
"""
:return: class index for the corresponding category name.
"""
return cls.INV_CLASS[name]

    def __len__(self):
        return len(self.index_list)

    def preload_anno(self):
        """
        :return: a list of labels, one entry per image in the split.
            Each entry has the form [class, weight, gt_class_list, gt_boxes], where
            class and weight are arrays of shape [20] (one value per category),
            gt_class_list is a list of class indices (one per ground-truth instance), and
            gt_boxes is a list of [xmin, ymin, xmax, ymax] coordinates normalized to the range 0 to 1.
        """
# TODO: Make sure you understand how the GT boxes and class labels are loaded
label_list = []
for index in self.index_list:
fpath = os.path.join(self.ann_dir, index + '.xml')
tree = ET.parse(fpath)
root = tree.getroot()
C = np.zeros(20)
            W = np.ones(20) * 2  # sentinel value; resolved to 0 or 1 below using the 'difficult' flag
# image h & w to normalize bbox coords
height = 0
width = 0
# new list for each index
gt_class_list = []
gt_boxes = []
for child in root:
if child.tag == 'size':
width = int(child[0].text)
height = int(child[1].text)
if child.tag == 'object':
                    C[self.INV_CLASS[child[0].text]] = 1  # mark this object's class as present
                    if child[3].text == '1' and W[self.INV_CLASS[child[0].text]] == 2:
                        W[self.INV_CLASS[child[0].text]] = 0  # only difficult instances seen so far: zero weight for this class
                    elif child[3].text == '0':
                        W[self.INV_CLASS[child[0].text]] = 1  # at least one non-difficult instance: full weight
# add class_index to gt_class_list
gt_class_list.append(self.INV_CLASS[child[0].text])
for t in child:
if t.tag == 'bndbox':
xmin = int(t[0].text) / width
ymin = int(t[1].text) / height
xmax = int(t[2].text) / width
ymax = int(t[3].text) / height
gt_boxes.append([xmin, ymin, xmax, ymax])
            for i in range(len(W)):
                if W[i] == 2:
                    W[i] = 1  # classes with no annotated objects keep full weight
label_list.append([C, W, gt_class_list, gt_boxes])
return label_list

    def __getitem__(self, index):
        """
        :param index: an int generated by the DataLoader, in the range [0, __len__())
        :return: the index-th element, containing all of the aforementioned information
        """
findex = self.index_list[index] # findex refers to the file number
fpath = os.path.join(self.img_dir, findex + '.jpg')
img = Image.open(fpath)
width, height = img.size
img = transforms.ToTensor()(img)
img = transforms.Resize((self.size, self.size))(img)
img = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])(img)
lab_vec = self.anno_list[index][0]
wgt_vec = self.anno_list[index][1]
label = torch.FloatTensor(lab_vec)
wgt = torch.FloatTensor(wgt_vec)
gt_class_list, gt_boxes = self.anno_list[index][2], self.anno_list[index][3]
"""
TODO:
1. Load bounding box proposals for the index from self.roi_data. The proposals are of the format:
[y_min, x_min, y_max, x_max] or [top left row, top left col, bottom right row, bottom right col]
2. Normalize in the range (0, 1) according to image size (be careful of width/height and x/y correspondences)
3. Make sure to return only the top_n proposals based on proposal confidence ("boxScores")!
4. You may have to write a custom collate_fn since some of the attributes below may be variable in number for each data point
"""
proposals = None
ret = {}
ret['image'] = img
ret['label'] = label
ret['wgt'] = wgt
ret['rois'] = proposals
ret['gt_boxes'] = gt_boxes
ret['gt_classes'] = gt_class_list
return ret
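

# A rough, illustrative sketch of the custom collate_fn mentioned in the TODO above
# (the name 'voc_collate_fn' is introduced here and is not part of the assignment
# skeleton): fixed-size tensors are stacked as usual, while variable-length fields
# (rois, gt_boxes, gt_classes) are kept as plain Python lists.
def voc_collate_fn(batch):
    return {
        'image': torch.stack([item['image'] for item in batch]),
        'label': torch.stack([item['label'] for item in batch]),
        'wgt': torch.stack([item['wgt'] for item in batch]),
        # a variable number of entries per image, so leave these as lists
        'rois': [item['rois'] for item in batch],
        'gt_boxes': [item['gt_boxes'] for item in batch],
        'gt_classes': [item['gt_classes'] for item in batch],
    }


if __name__ == '__main__':
    # Minimal usage example, assuming the data directory configured in __init__ exists.
    from torch.utils.data import DataLoader

    dataset = VOCDataset(split='trainval', image_size=224, top_n=300)
    loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=voc_collate_fn)
    batch = next(iter(loader))
    print(batch['image'].shape, batch['label'].shape)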