taoxugit · ImmortalBoi · Mar 7, 2019 · Mar 7, 2019 · Mar 19, 2019 · Dec 3, 2023
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# AttnGAN
+# AttnGAN (Python 3, PyTorch 1.0)
 
 Pytorch implementation for reproducing AttnGAN results in the paper [AttnGAN: Fine-Grained Text to Image Generation
 with Attentional Generative Adversarial Networks](http://openaccess.thecvf.com/content_cvpr_2018/papers/Xu_AttnGAN_Fine-Grained_Text_CVPR_2018_paper.pdf) by Tao Xu, Pengchuan Zhang, Qiuyuan Huang, Han Zhang, Zhe Gan, Xiaolei Huang, Xiaodong He. (This work was performed when Tao was an intern with Microsoft Research). 
@@ -7,9 +7,9 @@ with Attentional Generative Adversarial Networks](http://openaccess.thecvf.com/c
 
 
 ### Dependencies
-python 2.7
+Python 3.6+
 
-Pytorch
+PyTorch 1.0+
 
 In addition, please add the project folder to PYTHONPATH and `pip install` the following packages:
 - `python-dateutil`
@@ -27,7 +27,10 @@ In addition, please add the project folder to PYTHONPATH and `pip install` the f
 2. Download the [birds](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html) image data. Extract them to `data/birds/`
 3. Download [coco](http://cocodataset.org/#download) dataset and extract the images to `data/coco/`
 
-
+**Expected folder structure of the dataset directory (the `DATA_DIR` set in the YML config)**
+<div>- dataset</div>
+<div>|- images</div>
+<div>|- text</div>
 
 **Training**
 - Pre-train DAMSM models:
@@ -40,8 +43,6 @@ In addition, please add the project folder to PYTHONPATH and `pip install` the f
 
 - `*.yml` files are example configuration files for training/evaluation our models.
 
-
-
 **Pretrained Model**
 - [DAMSM for bird](https://drive.google.com/open?id=1GNUKjVeyWYBJ8hEU-yrfYQpDOkxEyP3V). Download and save it to `DAMSMencoders/`
 - [DAMSM for coco](https://drive.google.com/open?id=1zIrXCE9F6yfbEJIbNP5-YrEe2pZcPSGJ). Download and save it to `DAMSMencoders/`

diff --git a/code/GlobalAttention.py b/code/GlobalAttention.py
@@ -48,7 +48,7 @@ def func_attention(query, context, gamma1):
     attn = torch.bmm(contextT, query) # Eq. (7) in AttnGAN paper
     # --> batch*sourceL x queryL
     attn = attn.view(batch_size*sourceL, queryL)
-    attn = nn.Softmax()(attn)  # Eq. (8)
+    attn = nn.Softmax(dim=1)(attn)  # Eq. (8)
 
     # --> batch x sourceL x queryL
     attn = attn.view(batch_size, sourceL, queryL)
@@ -57,7 +57,7 @@ def func_attention(query, context, gamma1):
     attn = attn.view(batch_size*queryL, sourceL)
     #  Eq. (9)
     attn = attn * gamma1
-    attn = nn.Softmax()(attn)
+    attn = nn.Softmax(dim=1)(attn)
     attn = attn.view(batch_size, queryL, sourceL)
     # --> batch x sourceL x queryL
     attnT = torch.transpose(attn, 1, 2).contiguous()
@@ -73,7 +73,7 @@ class GlobalAttentionGeneral(nn.Module):
     def __init__(self, idf, cdf):
         super(GlobalAttentionGeneral, self).__init__()
         self.conv_context = conv1x1(cdf, idf)
-        self.sm = nn.Softmax()
+        self.sm = nn.Softmax(dim=1)
         self.mask = None
 
     def applyMask(self, mask):
@@ -104,7 +104,7 @@ def forward(self, input, context):
         attn = attn.view(batch_size*queryL, sourceL)
         if self.mask is not None:
             # batch_size x sourceL --> batch_size*queryL x sourceL
-            mask = self.mask.repeat(queryL, 1)
+            mask = self.mask.repeat(queryL, 1).to(torch.bool)
             attn.data.masked_fill_(mask.data, -float('inf'))
         attn = self.sm(attn)  # Eq. (2)
         # --> batch x queryL x sourceL

diff --git a/code/datasets.py b/code/datasets.py
@@ -3,7 +3,6 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 
-
 from nltk.tokenize import RegexpTokenizer
 from collections import defaultdict
 from miscc.config import cfg
@@ -13,6 +12,10 @@
 from torch.autograd import Variable
 import torchvision.transforms as transforms
 
+import os
+import shutil
+from sklearn.model_selection import train_test_split
+
 import os
 import sys
 import numpy as np
@@ -80,7 +83,7 @@ def get_imgs(img_path, imsize, bbox=None,
         for i in range(cfg.TREE.BRANCH_NUM):
             # print(imsize[i])
             if i < (cfg.TREE.BRANCH_NUM - 1):
-                re_img = transforms.Scale(imsize[i])(img)
+                re_img = transforms.Resize(imsize[i])(img)
             else:
                 re_img = img
             ret.append(normalize(re_img))
@@ -133,7 +136,7 @@ def load_bbox(self):
         #
         filename_bbox = {img_file[:-4]: [] for img_file in filenames}
         numImgs = len(filenames)
-        for i in xrange(0, numImgs):
+        for i in range(0, numImgs):
             # bbox = [x-left, y-top, width, height]
             bbox = df_bounding_boxes.iloc[i][1:].tolist()
 
@@ -142,12 +145,12 @@ def load_bbox(self):
         #
         return filename_bbox
 
-    def load_captions(self, data_dir, filenames):
+    def load_captions(self, data_dir, filenames:list[str], split):
         all_captions = []
         for i in range(len(filenames)):
-            cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
+            cap_path = '%s/%s/text/%s.txt' % (data_dir, split, filenames[i])
             with open(cap_path, "r") as f:
-                captions = f.read().decode('utf8').split('\n')
+                captions = f.read().split('\n')
                 cnt = 0
                 for cap in captions:
                     if len(cap) == 0:
@@ -221,8 +224,8 @@ def load_text_data(self, data_dir, split):
         train_names = self.load_filenames(data_dir, 'train')
         test_names = self.load_filenames(data_dir, 'test')
         if not os.path.isfile(filepath):
-            train_captions = self.load_captions(data_dir, train_names)
-            test_captions = self.load_captions(data_dir, test_names)
+            train_captions = self.load_captions(data_dir, train_names, "train")
+            test_captions = self.load_captions(data_dir, test_names, "test")
 
             train_captions, test_captions, ixtoword, wordtoix, n_words = \
                 self.build_dictionary(train_captions, test_captions)
@@ -251,7 +254,7 @@ def load_text_data(self, data_dir, split):
     def load_class_id(self, data_dir, total_num):
         if os.path.isfile(data_dir + '/class_info.pickle'):
             with open(data_dir + '/class_info.pickle', 'rb') as f:
-                class_id = pickle.load(f)
+                class_id = pickle.load(f, encoding="bytes")
         else:
             class_id = np.arange(total_num)
         return class_id
@@ -262,13 +265,69 @@ def load_filenames(self, data_dir, split):
             with open(filepath, 'rb') as f:
                 filenames = pickle.load(f)
             print('Load filenames from: %s (%d)' % (filepath, len(filenames)))
+
+            return filenames
+
         else:
-            filenames = []
-        return filenames
+            image_dir = os.path.join(data_dir, 'images')
+            text_dir = os.path.join(data_dir, 'text')
+
+            image_files = sorted(os.listdir(image_dir))
+
+            filenames = [os.path.splitext(f)[0] for f in image_files]
+
+            train_filenames, test_filenames = train_test_split(filenames, test_size=0.3)
+
+            image_train = [f + '.jpg' for f in train_filenames]
+            text_train = [f + '.txt' for f in train_filenames]
+
+            image_test = [f + '.jpg' for f in test_filenames]
+            text_test = [f + '.txt' for f in test_filenames]
+
+            train_image_dir = os.path.join(data_dir, 'train/images')
+            test_image_dir = os.path.join(data_dir, 'test/images')
+            train_text_dir = os.path.join(data_dir, 'train/text')
+            test_text_dir = os.path.join(data_dir, 'test/text')
+
+            os.makedirs(train_image_dir, exist_ok=True)
+            os.makedirs(test_image_dir, exist_ok=True)
+            os.makedirs(train_text_dir, exist_ok=True)
+            os.makedirs(test_text_dir, exist_ok=True)
+
+            for file in image_train:
+                shutil.move(os.path.join(image_dir, file), train_image_dir)
+
+            for file in image_test:
+                shutil.move(os.path.join(image_dir, file), test_image_dir)
+
+            for file in text_train:
+                shutil.move(os.path.join(text_dir, file), train_text_dir)
+
+            for file in text_test:
+                shutil.move(os.path.join(text_dir, file), test_text_dir)
+
+            os.rmdir(image_dir)
+            os.rmdir(text_dir)
+
+            with open('%s/%s/filenames.pickle' % (data_dir, "train"), 'wb') as f:
+                pickle.dump(train_filenames, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+            with open('%s/%s/filenames.pickle' % (data_dir, "test"), 'wb') as f:
+                pickle.dump(test_filenames, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+            print('Create pickle and Load filenames from: %s (%d)' % (filepath, len(filenames)))
+
+            if split == 'train':
+                return train_filenames
+            else:
+                return test_filenames 
 
     def get_caption(self, sent_ix):
         # a list of indices for a sentence
-        sent_caption = np.asarray(self.captions[sent_ix]).astype('int64')
+        try:
+            sent_caption = np.asarray(self.captions[sent_ix]).astype('int64')
+        except:
+            print(f'Getting caption for sent_ix: {sent_ix}')
         if (sent_caption == 0).sum() > 0:
             print('ERROR: do not need END (0) token', sent_caption)
         num_words = len(sent_caption)
@@ -298,13 +357,24 @@ def __getitem__(self, index):
             bbox = None
             data_dir = self.data_dir
         #
-        img_name = '%s/images/%s.jpg' % (data_dir, key)
+        img_name = ""
+        if os.path.isfile('%s/%s/images/%s.jpg' % (data_dir, "train", key)):
+            img_name = '%s/%s/images/%s.jpg' % (data_dir, "train", key)
+
+        if os.path.isfile('%s/%s/images/%s.jpg' % (data_dir, "test", key)):
+            img_name = '%s/%s/images/%s.jpg' % (data_dir, "test", key)
+
         imgs = get_imgs(img_name, self.imsize,
                         bbox, self.transform, normalize=self.norm)
         # random select a sentence
         sent_ix = random.randint(0, self.embeddings_num)
         new_sent_ix = index * self.embeddings_num + sent_ix
-        caps, cap_len = self.get_caption(new_sent_ix)
+        try:
+            caps, cap_len = self.get_caption(new_sent_ix)
+        except Exception as error:
+            print(error)
+            print(f'index: {index}, new_sent_ix: {new_sent_ix}, sent_ix: {sent_ix}, len(self.captions): {len(self.captions)}')
+            caps, cap_len = self.get_caption(new_sent_ix-1)
         return imgs, caps, cap_len, cls_id, key
 
 

diff --git a/code/main.py b/code/main.py
@@ -4,6 +4,8 @@
 from datasets import TextDataset
 from trainer import condGANTrainer as trainer
 
+from pathlib import Path
+
 import os
 import sys
 import time
@@ -39,14 +41,14 @@ def gen_example(wordtoix, algo):
     filepath = '%s/example_filenames.txt' % (cfg.DATA_DIR)
     data_dic = {}
     with open(filepath, "r") as f:
-        filenames = f.read().decode('utf8').split('\n')
+        filenames = f.read().split('\n')
         for name in filenames:
             if len(name) == 0:
                 continue
             filepath = '%s/%s.txt' % (cfg.DATA_DIR, name)
             with open(filepath, "r") as f:
                 print('Load from:', name)
-                sentences = f.read().decode('utf8').split('\n')
+                sentences = f.read().split('\n')
                 # a list of indices for a sentence
                 captions = []
                 cap_lens = []
@@ -110,8 +112,8 @@ def gen_example(wordtoix, algo):
 
     now = datetime.datetime.now(dateutil.tz.tzlocal())
     timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
-    output_dir = '../output/%s_%s_%s' % \
-        (cfg.DATASET_NAME, cfg.CONFIG_NAME, timestamp)
+    output_dir = '%s/../output/%s_%s_%s' % \
+        (str(Path(cfg.DATA_DIR).parent.parent) ,cfg.DATASET_NAME, cfg.CONFIG_NAME, timestamp)
 
     split_dir, bshuffle = 'train', True
     if not cfg.TRAIN.FLAG:
@@ -121,7 +123,7 @@ def gen_example(wordtoix, algo):
     # Get data loader
     imsize = cfg.TREE.BASE_SIZE * (2 ** (cfg.TREE.BRANCH_NUM - 1))
     image_transform = transforms.Compose([
-        transforms.Scale(int(imsize * 76 / 64)),
+        transforms.Resize(int(imsize * 76 / 64)),
         transforms.RandomCrop(imsize),
         transforms.RandomHorizontalFlip()])
     dataset = TextDataset(cfg.DATA_DIR, split_dir,

diff --git a/code/miscc/config.py b/code/miscc/config.py
@@ -70,9 +70,9 @@ def _merge_a_into_b(a, b):
     if type(a) is not edict:
         return
 
-    for k, v in a.iteritems():
+    for k, v in a.items():
         # a must specify keys that are in b
-        if not b.has_key(k):
+        if k not in b:
             raise KeyError('{} is not a valid config key'.format(k))
 
         # the types must match, too
@@ -100,6 +100,6 @@ def cfg_from_file(filename):
     """Load a config file and merge it into the default options."""
     import yaml
     with open(filename, 'r') as f:
-        yaml_cfg = edict(yaml.load(f))
+        yaml_cfg = edict(yaml.safe_load(f))
 
     _merge_a_into_b(yaml_cfg, __C)
diff --git a/code/miscc/losses.py b/code/miscc/losses.py
@@ -49,7 +49,8 @@ def sent_loss(cnn_code, rnn_code, labels, class_ids,
     # --> batch_size x batch_size
     scores0 = scores0.squeeze()
     if class_ids is not None:
-        scores0.data.masked_fill_(masks, -float('inf'))
+        masks_bool = masks.to(torch.bool)
+        scores0.data.masked_fill_(masks_bool, -float('inf'))
     scores1 = scores0.transpose(0, 1)
     if labels is not None:
         loss0 = nn.CrossEntropyLoss()(scores0, labels)
@@ -122,7 +123,8 @@ def words_loss(img_features, words_emb, labels,
 
     similarities = similarities * cfg.TRAIN.SMOOTH.GAMMA3
     if class_ids is not None:
-        similarities.data.masked_fill_(masks, -float('inf'))
+        masks_bool = masks.to(torch.bool)
+        similarities.data.masked_fill_(masks_bool, -float('inf'))
     similarities1 = similarities.transpose(0, 1)
     if labels is not None:
         loss0 = nn.CrossEntropyLoss()(similarities, labels)
@@ -181,7 +183,7 @@ def generator_loss(netsD, image_encoder, fake_imgs, real_labels,
             g_loss = cond_errG
         errG_total += g_loss
         # err_img = errG_total.data[0]
-        logs += 'g_loss%d: %.2f ' % (i, g_loss.data[0])
+        logs += 'g_loss%d: %.2f ' % (i, g_loss.item())
 
         # Ranking loss
         if i == (numDs - 1):
@@ -202,7 +204,7 @@ def generator_loss(netsD, image_encoder, fake_imgs, real_labels,
             # err_sent = err_sent + s_loss.data[0]
 
             errG_total += w_loss + s_loss
-            logs += 'w_loss: %.2f s_loss: %.2f ' % (w_loss.data[0], s_loss.data[0])
+            logs += 'w_loss: %.2f s_loss: %.2f ' % (w_loss.item(), s_loss.item())
     return errG_total, logs