From 94085cf783d9faa68f0da26554cb42aadce69e11 Mon Sep 17 00:00:00 2001
From: hsiayukoo_mindspore
Date: Thu, 25 Apr 2024 10:44:13 +0800
Subject: [PATCH] fix the following bugs: (1) we should close() after open()
 (2) use meaningful variables (3) fix some deep copy problems

fix the following bugs:
(1) we should close() after open()
(2) use meaningful variables
(3) fix some deep copy problems

add empty line

fix guides.

change some code comments
---
 .../finetune_with_a_custom_dataset.md   | 49 +++++++++++--------
 .../finetune_with_a_custom_dataset.md   | 47 +++++++++++-------
 examples/finetune/read_images_online.py | 28 ++++++-----
 examples/finetune/split_files.py        | 16 +++---
 4 files changed, 81 insertions(+), 59 deletions(-)

diff --git a/docs/en/how_to_guides/finetune_with_a_custom_dataset.md b/docs/en/how_to_guides/finetune_with_a_custom_dataset.md
index cfc4e88ee..1fc6eb4c4 100644
--- a/docs/en/how_to_guides/finetune_with_a_custom_dataset.md
+++ b/docs/en/how_to_guides/finetune_with_a_custom_dataset.md
@@ -55,21 +55,24 @@ DATASET_NAME
 Next, we'll take the annotation file ./aircraft/data/images_variant_trainval.txt as an example, locally generate the file of train set ./aircraft/data/images/trainval/, which meets the request of a tree-structure directory.
 
 ```python
-import shutil
 import os
+import shutil
+
 
 # only for Aircraft dataset but not a general one
 def extract_images(images_path, subset_name, annotation_file_path, copy=True):
     # read the annotation file to get the label of each image
     def annotations(annotation_file_path):
         image_label = {}
-        for i in open(annotation_file_path, "r"):
-            label = " ".join(i.split(" ")[1:]).replace("\n", "").replace("/", "_")
-            if label not in image_label.keys():
-                image_label[label] = []
-                image_label[label].append(i.split(" ")[0])
-            else:
-                image_label[label].append(i.split(" ")[0])
+        with open(annotation_file_path, "r") as f:
+            lines = f.readlines()
+        for line in lines:
+            label = " ".join(line.split(" ")[1:]).replace("\n", "").replace("/", "_")
+            if label not in image_label.keys():
+                image_label[label] = []
+                image_label[label].append(line.split(" ")[0])
+            else:
+                image_label[label].append(line.split(" ")[0])
         return image_label
 
     # make a new folder for subset
@@ -89,6 +92,7 @@ def extract_images(images_path, subset_name, annotation_file_path, copy=True):
                 shutil.move(images_path + image_name, label_folder)
 
 
+# take train set of aircraft dataset as an example
 images_path = "./aircraft/data/images/"
 subset_name = "trainval"
 annotation_file_path = "./aircraft/data/images_variant_trainval.txt"
@@ -138,6 +142,7 @@ Here's how we generate a random-accessible dataset object that stores the images
 
 ```python
 import numpy as np
+
 from mindspore.dataset import GeneratorDataset
 
 
@@ -145,11 +150,13 @@ class ImageClsDataset:
     def __init__(self, annotation_dir, images_dir):
         # Read annotations
         self.annotation = {}
-        for i in open(annotation_dir, "r"):
-            image_label = i.replace("\n", "").replace("/", "_").split(" ")
-            image = image_label[0] + ".jpg"
-            label = " ".join(image_label[1:])
-            self.annotation[image] = label
+        with open(annotation_dir, "r") as f:
+            lines = f.readlines()
+        for line in lines:
+            image_label = line.replace("\n", "").replace("/", "_").split(" ")
+            image = image_label[0] + ".jpg"
+            label = " ".join(image_label[1:])
+            self.annotation[image] = label
 
         # Transfer string-type label to int-type label
         self.label2id = {}
@@ -161,13 +168,13 @@ class ImageClsDataset:
             self.annotation[image] = self.label2id[label]
 
         # Read image-labels as mappable object
-        images = dict.fromkeys(self.label2id.values(), [])
+        label2images = {key: [] for key in self.label2id.values()}
         for image, label in self.annotation.items():
             read_image = np.fromfile(images_dir + image, dtype=np.uint8)
-            images[label].append(read_image)
+            label2images[label].append(read_image)
 
-        self._data = sum(list(images.values()), [])
-        self._label = sum([[i] * len(images[i]) for i in images.keys()], [])
+        self._data = sum(list(label2images.values()), [])
+        self._label = sum([[i] * len(label2images[i]) for i in label2images.keys()], [])
 
     # make class ImageClsDataset a mappable object
     def __getitem__(self, index):
@@ -176,10 +183,12 @@
     def __len__(self):
         return len(self._data)
 
+
+# take aircraft dataset as an example
 annotation_dir = "./aircraft/data/images_variant_trainval.txt"
-images_dir = "./aircraft/data/iamges/"
-dataset = ImageClsDataset(annotation_dir)
-ataset_train = GeneratorDataset(source=dataset, column_names=["image", "label"], shuffle=True)
+images_dir = "./aircraft/data/images/"
+dataset = ImageClsDataset(annotation_dir, images_dir)
+dataset_train = GeneratorDataset(source=dataset, column_names=["image", "label"], shuffle=True)
 ```
 
 Compared with the offline way, the online way skipped the step of splitting the data file locally and reading the local file with the `create_dataset` function. So in the subsequent training, simply **replace the part of finetune.py that uses `create_dataset` with the above code**, then you can start training by running finetune.py directly as what you do after reading the dataset offline.
diff --git a/docs/zh/how_to_guides/finetune_with_a_custom_dataset.md b/docs/zh/how_to_guides/finetune_with_a_custom_dataset.md
index f76b29489..6dd6f6bf6 100644
--- a/docs/zh/how_to_guides/finetune_with_a_custom_dataset.md
+++ b/docs/zh/how_to_guides/finetune_with_a_custom_dataset.md
@@ -55,21 +55,25 @@ DATASET_NAME
 接下来以说明文件./aircraft/data/images_variant_trainval.txt 为例,在本地生成满足前述树状结构的训练集文件 ./aircraft/data/images/trainval/。
 
 ```python
-import shutil
+""" Extract images and generate ImageNet-style dataset directory """
 import os
+import shutil
+
 
 # only for Aircraft dataset but not a general one
 def extract_images(images_path, subset_name, annotation_file_path, copy=True):
     # read the annotation file to get the label of each image
     def annotations(annotation_file_path):
         image_label = {}
-        for i in open(annotation_file_path, "r"):
-            label = " ".join(i.split(" ")[1:]).replace("\n", "").replace("/", "_")
-            if label not in image_label.keys():
-                image_label[label] = []
-                image_label[label].append(i.split(" ")[0])
-            else:
-                image_label[label].append(i.split(" ")[0])
+        with open(annotation_file_path, "r") as f:
+            lines = f.readlines()
+        for line in lines:
+            label = " ".join(line.split(" ")[1:]).replace("\n", "").replace("/", "_")
+            if label not in image_label.keys():
+                image_label[label] = []
+                image_label[label].append(line.split(" ")[0])
+            else:
+                image_label[label].append(line.split(" ")[0])
         return image_label
 
     # make a new folder for subset
@@ -89,6 +93,7 @@ def extract_images(images_path, subset_name, annotation_file_path, copy=True):
                 shutil.move(images_path + image_name, label_folder)
 
 
+# take train set of aircraft dataset as an example
 images_path = "./aircraft/data/images/"
 subset_name = "trainval"
 annotation_file_path = "./aircraft/data/images_variant_trainval.txt"
@@ -143,6 +148,7 @@ aircraft
 
 ```python
 import numpy as np
+
 from mindspore.dataset import GeneratorDataset
 
 
@@ -150,11 +156,13 @@ class ImageClsDataset:
     def __init__(self, annotation_dir, images_dir):
         # Read annotations
         self.annotation = {}
-        for i in open(annotation_dir, "r"):
-            image_label = i.replace("\n", "").replace("/", "_").split(" ")
-            image = image_label[0] + ".jpg"
-            label = " ".join(image_label[1:])
-            self.annotation[image] = label
+        with open(annotation_dir, "r") as f:
+            lines = f.readlines()
+        for line in lines:
+            image_label = line.replace("\n", "").replace("/", "_").split(" ")
+            image = image_label[0] + ".jpg"
+            label = " ".join(image_label[1:])
+            self.annotation[image] = label
 
         # Transfer string-type label to int-type label
         self.label2id = {}
@@ -166,13 +174,13 @@ class ImageClsDataset:
             self.annotation[image] = self.label2id[label]
 
         # Read image-labels as mappable object
-        images = dict.fromkeys(self.label2id.values(), [])
+        label2images = {key: [] for key in self.label2id.values()}
         for image, label in self.annotation.items():
             read_image = np.fromfile(images_dir + image, dtype=np.uint8)
-            images[label].append(read_image)
+            label2images[label].append(read_image)
 
-        self._data = sum(list(images.values()), [])
-        self._label = sum([[i] * len(images[i]) for i in images.keys()], [])
+        self._data = sum(list(label2images.values()), [])
+        self._label = sum([[i] * len(label2images[i]) for i in label2images.keys()], [])
 
     # make class ImageClsDataset a mappable object
     def __getitem__(self, index):
@@ -182,9 +190,10 @@ class ImageClsDataset:
         return len(self._data)
 
 
+# take aircraft dataset as an example
 annotation_dir = "./aircraft/data/images_variant_trainval.txt"
-images_dir = "./aircraft/data/iamges/"
-dataset = ImageClsDataset(annotation_dir)
+images_dir = "./aircraft/data/images/"
+dataset = ImageClsDataset(annotation_dir, images_dir)
 dataset_train = GeneratorDataset(source=dataset, column_names=["image", "label"], shuffle=True)
 ```
 
diff --git a/examples/finetune/read_images_online.py b/examples/finetune/read_images_online.py
index ab413bdbf..1c44ba009 100644
--- a/examples/finetune/read_images_online.py
+++ b/examples/finetune/read_images_online.py
@@ -8,11 +8,13 @@ class ImageClsDataset:
     def __init__(self, annotation_dir, images_dir):
         # Read annotations
         self.annotation = {}
-        for i in open(annotation_dir, "r"):
-            image_label = i.replace("\n", "").replace("/", "_").split(" ")
-            image = image_label[0] + ".jpg"
-            label = " ".join(image_label[1:])
-            self.annotation[image] = label
+        with open(annotation_dir, "r") as f:
+            lines = f.readlines()
+        for line in lines:
+            image_label = line.replace("\n", "").replace("/", "_").split(" ")
+            image = image_label[0] + ".jpg"
+            label = " ".join(image_label[1:])
+            self.annotation[image] = label
 
         # Transfer string-type label to int-type label
         self.label2id = {}
@@ -23,16 +25,16 @@ def __init__(self, annotation_dir, images_dir):
         for image, label in self.annotation.items():
             self.annotation[image] = self.label2id[label]
 
-        # Read image-labels as iterable object
-        images = dict.fromkeys(self.label2id.values(), [])
+        # Read image-labels as mappable object
+        label2images = {key: [] for key in self.label2id.values()}
         for image, label in self.annotation.items():
             read_image = np.fromfile(images_dir + image, dtype=np.uint8)
-            images[label].append(read_image)
+            label2images[label].append(read_image)
 
-        self._data = sum(list(images.values()), [])
-        self._label = sum([[i] * len(images[i]) for i in images.keys()], [])
+        self._data = sum(list(label2images.values()), [])
+        self._label = sum([[i] * len(label2images[i]) for i in label2images.keys()], [])
 
-    # make class ImageClsDataset an iterable object
+    # make class ImageClsDataset a mappable object
     def __getitem__(self, index):
         return self._data[index], self._label[index]
 
@@ -42,6 +44,6 @@ def __len__(self):
 
 # take aircraft dataset as an example
 annotation_dir = "./aircraft/data/images_variant_trainval.txt"
-images_dir = "./aircraft/data/iamges/"
-dataset = ImageClsDataset(annotation_dir)
+images_dir = "./aircraft/data/images/"
+dataset = ImageClsDataset(annotation_dir, images_dir)
 dataset_train = GeneratorDataset(source=dataset, column_names=["image", "label"], shuffle=True)
diff --git a/examples/finetune/split_files.py b/examples/finetune/split_files.py
index 9398136fd..8c193d5e4 100644
--- a/examples/finetune/split_files.py
+++ b/examples/finetune/split_files.py
@@ -8,13 +8,15 @@ def extract_images(images_path, subset_name, annotation_file_path, copy=True):
     # read the annotation file to get the label of each image
     def annotations(annotation_file_path):
         image_label = {}
-        for i in open(annotation_file_path, "r"):
-            label = " ".join(i.split(" ")[1:]).replace("\n", "").replace("/", "_")
-            if label not in image_label.keys():
-                image_label[label] = []
-                image_label[label].append(i.split(" ")[0])
-            else:
-                image_label[label].append(i.split(" ")[0])
+        with open(annotation_file_path, "r") as f:
+            lines = f.readlines()
+        for line in lines:
+            label = " ".join(line.split(" ")[1:]).replace("\n", "").replace("/", "_")
+            if label not in image_label.keys():
+                image_label[label] = []
+                image_label[label].append(line.split(" ")[0])
+            else:
+                image_label[label].append(line.split(" ")[0])
         return image_label
 
     # make a new folder for subset
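The "deep copy problems" item in the commit message refers to the `images = dict.fromkeys(self.label2id.values(), [])` lines removed above: `dict.fromkeys` binds every key to the same list object, so appending images for one label appends them for all labels. Below is a minimal standalone sketch of that pitfall and of the dict-comprehension fix; it is not part of the patch itself, and the label ids are made up for illustration.

```python
# Sketch only, not part of the patch: why dict.fromkeys(keys, []) is replaced
# with a dict comprehension in ImageClsDataset.__init__.
label_ids = [0, 1, 2]  # hypothetical stand-in for self.label2id.values()

# Buggy: every key shares the SAME list object, so one append shows up under all labels.
shared = dict.fromkeys(label_ids, [])
shared[0].append("img_0001.jpg")
print(shared)     # {0: ['img_0001.jpg'], 1: ['img_0001.jpg'], 2: ['img_0001.jpg']}

# Fixed: the comprehension creates a fresh list per key, as label2images does after this patch.
per_label = {key: [] for key in label_ids}
per_label[0].append("img_0001.jpg")
print(per_label)  # {0: ['img_0001.jpg'], 1: [], 2: []}
```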