Production of NUS-WIDE-10K dataset

Keywords: Python

NUS-WIDE is a multi label dataset, which contains 269648 samples and can be divided into 81 classes. Recently, I was doing a cross modal retrieval project and found that many papers used the NUS-WIDE-10K dataset, but I didn't find the relevant dataset download link or dataset creation code on the Internet, so I wrote a NUS-WIDE-10K dataset creation code myself.

1. Introduction

The earliest paper I found on the Internet that describes the NUS-WIDE-10K dataset is "Cross-modal Retrieval with Correspondence Autoencoder". The description is as follows: the authors select the 10 largest classes in the NUS-WIDE dataset (animal, clouds, flowers, food, grass, person, sky, toy, water and window) and pick 1000 pictures from each class (10000 in total) to form the NUS-WIDE-10K dataset. NUS-WIDE-10K is then randomly divided into three subsets: training set, validation set and test set, containing 8000, 1000 and 1000 samples respectively.

2. Data set analysis

Let's take a look at the main structure of the NUS-WIDE dataset

Flickr folder: the Flickr folder contains all the pictures in the NUS-WIDE dataset. There are 704 sub-folders, and each sub-folder holds one kind of picture (remember that NUS-WIDE is a multi-label dataset, so a single picture can carry more than one label).
Groundtruth folder: after decompression there are two directories, AllLabels/ and TrainTestLabels/. AllLabels/ contains 81 .txt files in total, one per category. Each file contains 269648 lines of 0/1 data, one line per sample, indicating whether that sample belongs to the category.
ImageList folder: there are three files in this folder: Imagelist.txt, TestImagelist.txt and TrainImagelist.txt. We only use Imagelist.txt, which lists the storage addresses of all pictures of the dataset in order.
NUS_WID_Tags folder: the folder has multiple files, but we only use All_Tags.txt, which stores the text descriptions of all pictures in order.
Concepts81.txt: contains the class name of 81 categories.

3. Code

extract

Extract the ids of the samples that belong to each of the categories animal, clouds, flowers, food, grass, person, sky, toy, water and window.
1000 samples were randomly selected from each category.

import os
import numpy as np
import random
from tqdm import tqdm
import shutil
import sys

N_SAMPLE = 269648  # total number of samples in NUS-WIDE (not used below)
label_dir = "Groundtruth/AllLabels"
image_dir = 'ImageList/ImageList.txt'
txt_dir = 'NUS_WID_Tags/All_Tags.txt'
output_dir = 'NUS_WIDE_10K/NUS_WIDE_10k.list'
classes = ['animal', 'clouds', 'flowers', 'food', 'grass', 'person', 'sky', 'toy', 'water', 'window']   #10 categories of NUS-WIDE-10K dataset

# Map every class name listed in Concepts81.txt to its line index and back.
print('loading all class names')
cls_id = {}
with open("Concepts81.txt", "r") as f:
    for cid, line in enumerate(f):
        cn = line.strip()
        cls_id[cn] = cid
id_cls = {cls_id[k]: k for k in cls_id}
print('Finished, with {} classes.'.format(len(id_cls)))

# For each selected class, collect the ids (0-based line numbers) of all
# positive samples and randomly keep a fixed number of them.
print('Extract and sample id from label files')
samples_per_class = 1000
data_list = {}
for clas in classes:
    # Open the label file by its exact name. Substring matching against the
    # directory listing would also pick up Labels_waterfall.txt for 'water'.
    class_file = 'Labels_{}.txt'.format(clas)
    print('class_file:' + class_file)
    with open(os.path.join(label_dir, class_file), "r") as f:
        positives = [sid for sid, line in enumerate(f) if int(line) > 0]
    print('total samples of {}:'.format(clas) + str(len(positives)))
    if len(positives) < samples_per_class:
        raise ValueError(
            'class {!r} has only {} positive samples, need {}'.format(
                clas, len(positives), samples_per_class))
    data_list[clas] = random.sample(positives, samples_per_class)    #Randomly select 1000 data for each category
    print('sample number of ' + clas + ':{}\n'.format(len(data_list[clas])))

Generate NUS_WIDE_10k.list file

For ease of use, we create a NUS_WIDE_10k.list file that stores one (picture address, text description, sample category) triple per line.

# Load the image paths and the tag texts. Both files are read in order and
# the sampled ids in data_list are used as indices into these lists.
with open(image_dir, "r") as f:
    images = [line.strip() for line in f]
with open(txt_dir, "r", encoding='utf-8') as f:
    # Fields are separated by six spaces; the last field is kept as the text.
    txts = [line.strip().split('      ')[-1] for line in f]
print('images:{}  text:{}'.format(len(images), len(txts)))

print('Write the list')
# Make sure the folder that will hold the list file exists; open(..., "w")
# does not create missing parent directories.
out_parent = os.path.dirname(output_dir)
if out_parent:
    os.makedirs(out_parent, exist_ok=True)
with open(output_dir, "w", encoding='utf-8') as f:
    for clas in classes:
        for i in data_list[clas]:
            # The image list uses backslashes as path separators; store the
            # path with forward slashes instead.
            img_path = images[i].replace('\\', '/')
            f.write('{}      {}      {}\n'.format(img_path, txts[i], clas))

Divide image training set and test set

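In the NUS-WIDE-10K protocol described above, each category contributes 800, 100 and 100 images for training, validation and testing. Note that the function below only makes a two-way split: out of every 1000 consecutive entries in the list file, 900 images are copied to train/ and 100 to val/.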

def split_train_val_nus_wide_10k(list_path=None, image_root='Flickr',
                                 split_root='NUS_WIDE_10K/image_split'):
    """Copy the listed images into per-class ``train`` and ``val`` folders.

    Each line of the list file holds fields separated by six spaces; the
    first field is the image path relative to ``image_root`` and the last
    field is the class name. Images are copied to
    ``<split_root>/<train|val>/<class>/<file name>``.

    Args:
        list_path: Path of the list file. Defaults to the module-level
            ``output_dir`` when omitted.
        image_root: Folder that contains the source images.
        split_root: Folder under which ``train`` and ``val`` are created.
    """
    if list_path is None:
        list_path = output_dir  # module-level path of the generated list file
    print('Split training and test set for images')
    with open(list_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            print("\r", end="")
            print("Download progress: {}%: ".format(i/100), "▋" * (i // 100), end="")
            sys.stdout.flush()
            fields = line.strip().split('      ')
            img = fields[0]
            label = fields[-1]
            # (i + 1) is the 1-based line number. Within every run of 1000
            # lines, the 100 lines whose number modulo 1000 lies in 900..999
            # go to val and the remaining 900 go to train.
            subset = 'train' if (i + 1) % 1000 < 900 else 'val'
            target_dir = os.path.join(split_root, subset, label)
            # makedirs also creates the missing parents (image_split/train
            # and image_split/val), which a plain os.mkdir would not.
            os.makedirs(target_dir, exist_ok=True)
            shutil.copyfile(os.path.join(image_root, img),
                            os.path.join(target_dir, img.split('/')[-1]))

Complete code

import os
import numpy as np
import random
from tqdm import tqdm
import shutil
import sys

N_SAMPLE = 269648  # total number of samples in NUS-WIDE (not used below)
label_dir = "Groundtruth/AllLabels"
image_dir = 'ImageList/ImageList.txt'
txt_dir = 'NUS_WID_Tags/All_Tags.txt'
output_dir = 'NUS_WIDE_10K/NUS_WIDE_10k.list'
classes = ['animal', 'clouds', 'flowers', 'food', 'grass', 'person', 'sky', 'toy', 'water', 'window']   #10 categories of NUS-WIDE-10K dataset

# Map every class name listed in Concepts81.txt to its line index and back.
print('loading all class names')
cls_id = {}
with open("Concepts81.txt", "r") as f:
    for cid, line in enumerate(f):
        cn = line.strip()
        cls_id[cn] = cid
id_cls = {cls_id[k]: k for k in cls_id}
print('Finished, with {} classes.'.format(len(id_cls)))

# For each selected class, collect the ids (0-based line numbers) of all
# positive samples and randomly keep a fixed number of them.
print('Extract and sample id from label files')
samples_per_class = 1000
data_list = {}
for clas in classes:
    # Open the label file by its exact name. Substring matching against the
    # directory listing would also pick up Labels_waterfall.txt for 'water'.
    class_file = 'Labels_{}.txt'.format(clas)
    print('class_file:' + class_file)
    with open(os.path.join(label_dir, class_file), "r") as f:
        positives = [sid for sid, line in enumerate(f) if int(line) > 0]
    print('total samples of {}:'.format(clas) + str(len(positives)))
    if len(positives) < samples_per_class:
        raise ValueError(
            'class {!r} has only {} positive samples, need {}'.format(
                clas, len(positives), samples_per_class))
    data_list[clas] = random.sample(positives, samples_per_class)    #Randomly select 1000 data for each category
    print('sample number of ' + clas + ':{}\n'.format(len(data_list[clas])))

# Load the image paths and the tag texts. Both files are read in order and
# the sampled ids in data_list are used as indices into these lists.
print('Extract all images and text')
with open(image_dir, "r") as f:
    images = [line.strip() for line in f]
with open(txt_dir, "r", encoding='utf-8') as f:
    # Fields are separated by six spaces; the last field is kept as the text.
    txts = [line.strip().split('      ')[-1] for line in f]
print('images:{}  text:{}'.format(len(images), len(txts)))

print('Write the list')
# Make sure the folder that will hold the list file exists; open(..., "w")
# does not create missing parent directories.
out_parent = os.path.dirname(output_dir)
if out_parent:
    os.makedirs(out_parent, exist_ok=True)
with open(output_dir, "w", encoding='utf-8') as f:
    for clas in classes:
        for i in data_list[clas]:
            # The image list uses backslashes as path separators; store the
            # path with forward slashes instead.
            img_path = images[i].replace('\\', '/')
            f.write('{}      {}      {}\n'.format(img_path, txts[i], clas))
print('Finished!')


def split_train_val_nus_wide_10k(list_path=None, image_root='Flickr',
                                 split_root='NUS_WIDE_10K/image_split'):
    """Copy the listed images into per-class ``train`` and ``val`` folders.

    Each line of the list file holds fields separated by six spaces; the
    first field is the image path relative to ``image_root`` and the last
    field is the class name. Images are copied to
    ``<split_root>/<train|val>/<class>/<file name>``.

    Args:
        list_path: Path of the list file. Defaults to the module-level
            ``output_dir`` when omitted.
        image_root: Folder that contains the source images.
        split_root: Folder under which ``train`` and ``val`` are created.
    """
    if list_path is None:
        list_path = output_dir  # module-level path of the generated list file
    print('Split training and test set for images')
    with open(list_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            print("\r", end="")
            print("Download progress: {}%: ".format(i/100), "▋" * (i // 100), end="")
            sys.stdout.flush()
            fields = line.strip().split('      ')
            img = fields[0]
            label = fields[-1]
            # (i + 1) is the 1-based line number. Within every run of 1000
            # lines, the 100 lines whose number modulo 1000 lies in 900..999
            # go to val and the remaining 900 go to train.
            subset = 'train' if (i + 1) % 1000 < 900 else 'val'
            target_dir = os.path.join(split_root, subset, label)
            # makedirs also creates the missing parents (image_split/train
            # and image_split/val), which a plain os.mkdir would not.
            os.makedirs(target_dir, exist_ok=True)
            shutil.copyfile(os.path.join(image_root, img),
                            os.path.join(target_dir, img.split('/')[-1]))

# Run the train/val image split using the list file written above.
split_train_val_nus_wide_10k()

Posted by ambrennan on Tue, 12 Oct 2021 12:23:47 -0700

Programmer Group