update scripts and command

2023-08-15 13:34:36 +00:00
parent 6bfbbbf6a2
commit 5821d7bf8b
5 changed files with 118 additions and 17 deletions
--- a/preprocess_image.py
+++ b/preprocess_image.py
@@ -204,6 +204,7 @@ def process_single_image(image_path, depth_estimator, normal_estimator=None):
        print(f'[INFO] loading rgba image {rgba_path}...')
        rgba = cv2.cvtColor(cv2.imread(rgba_path, cv2.IMREAD_UNCHANGED), cv2.COLOR_BGRA2RGBA)
        image = cv2.cvtColor(rgba, cv2.COLOR_RGBA2RGB)
+
    else:
        print(f'[INFO] loading image {image_path}...')
        image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
@@ -213,9 +214,6 @@ def process_single_image(image_path, depth_estimator, normal_estimator=None):
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        print(f'[INFO] background removal...')
        rgba = BackgroundRemoval()(image)  # [H, W, 4]
-        cv2.imwrite(rgba_path, cv2.cvtColor(rgba, cv2.COLOR_RGBA2BGRA))
-        # rgba = get_rgba(image)  # [H, W, 4]
-        # cv2.imwrite(rgba_path.replace('rgba', 'rgba2'), cv2.cvtColor(rgba, cv2.COLOR_RGBA2BGRA))

    # Predict depth using Midas
    mask = rgba[..., -1] > 0
@@ -229,8 +227,30 @@ def process_single_image(image_path, depth_estimator, normal_estimator=None):
    # normal = (normal.clip(0, 1) * 255).astype(np.uint8).transpose(1, 2, 0)
    # normal[~mask] = 0
     
+    height, width, _ = image.shape
+    # Determine the padding needed to make the image square
+    if height > width:
+        left_padding = (height - width) // 2
+        right_padding = height - width - left_padding
+        padding = ((0, 0), (left_padding, right_padding), (0, 0))
+        padding2d = ((0, 0), (left_padding, right_padding))
+    elif width > height:
+        top_padding = (width - height) // 2
+        bottom_padding = width - height - top_padding
+        padding = ((top_padding, bottom_padding), (0, 0), (0, 0))
+        padding2d = ((top_padding, bottom_padding), (0, 0))
+    else:
+        padding = ((0, 0), (0, 0), (0, 0))
+        padding2d = ((0, 0), (0, 0))
+        
+    # Apply padding to the image
+    image = np.pad(image, padding, mode='constant', constant_values=0)
+    rgba = np.pad(rgba, padding, mode='constant', constant_values=0)
+    depth = np.pad(depth, padding2d, mode='constant', constant_values=0)
+
    cv2.imwrite(depth_path, depth)
    # cv2.imwrite(out_normal, cv2.cvtColor(normal, cv2.COLOR_RGB2BGR))
+    # breakpoint()
    if not os.path.exists(rgba_path):
        cv2.imwrite(rgba_path, cv2.cvtColor(rgba, cv2.COLOR_RGBA2BGRA))

@@ -238,13 +258,21 @@ if __name__ == '__main__':
    import glob
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', default=None, type=str, nargs='*', help="path to image (png, jpeg, etc.)")
-    parser.add_argument('--folder', default=None, type=str, help="path to image (png, jpeg, etc.)")
+    parser.add_argument('--folder', default=None, type=str, help="path to a folder of image (png, jpeg, etc.)")
+    parser.add_argument('--imagepattern', default="image.png", type=str, help="image name pattern")
+    parser.add_argument('--exclude', default='', type=str, nargs='*', help="path to image (png, jpeg, etc.) to exclude")
    opt = parser.parse_args()

    depth_estimator = DepthEstimator()
    # normal_estimator = DPT(task='normal')
    
-    paths = opt.path if opt.path is not None else glob.glob(os.path.join(opt.folder, '*/rgba.png')) 
+    if opt.path is not None:
+        paths = opt.path
+    else:
+        # glob.glob returns a list, so excluded paths are dropped by value (a list cannot be indexed by a string)
+        excluded = set(opt.exclude or [])
+        candidates = glob.glob(os.path.join(opt.folder, f'*/{opt.imagepattern}'))
+        paths = [p for p in candidates if p not in excluded]
    for path in paths:
        process_single_image(path, depth_estimator, 
                            #  normal_estimator
--- a/readme.md
+++ b/readme.md
@@ -101,14 +101,14 @@ Takes ~40 mins for the coarse stage and ~20 mins for the second stage on a 32G V
 bash scripts/magic123/run_both_priors.sh $GPU_NO $JOBNAME_First_Stage $JOBNAME_Second_Stage $PATH_to_Example_Directory $IMAGE_BASE_NAME $Enable_First_Stage $Enable_Second_Stage {More_Arguments}
 ```

-As an example, run Magic123 in the dragon example using both stages in GPU 0 and set the jobname for the first stage as `default` and the jobname for the second stage as `dmtet`, by the following command:
+As an example, run Magic123 in the dragon example using both stages in GPU 0 and set the jobname for the first stage as `nerf` and the jobname for the second stage as `dmtet`, by the following command:
 ```bash
-bash scripts/magic123/run_both_priors.sh 0 default dmtet data/realfusion15/metal_dragon_statue rgba.png 1 1 
+bash scripts/magic123/run_both_priors.sh 0 nerf dmtet data/realfusion15/metal_dragon_statue 1 1 
 ```

 More arguments (e.g. `--lambda_guidance 1 40`) can be appended to the command line such as:
 ```bash
-bash scripts/magic123/run_both_priors.sh 0 default dmtet data/realfusion15/metal_dragon_statue rgba.png 1 1 --lambda_guidance 1 40
+bash scripts/magic123/run_both_priors.sh 0 nerf dmtet data/realfusion15/metal_dragon_statue 1 1 --lambda_guidance 1 40
 ```

 ### Run Magic123 for a group of examples
@@ -117,11 +117,11 @@ bash scripts/magic123/run_both_priors.sh 0 default dmtet data/realfusion15/metal


 ### Run Magic123 on a single example without textual inversion
-textual inversion is tedious (requires ~2.5 hours optimization), if you want to test Magic123 quickly on your own example without texural inversion (might degrade the performance), try the following:
+textual inversion is tedious (requires ~2.5 hours optimization), if you want to test Magic123 quickly on your own example without textual inversion (might degrade the performance), try the following:

 - first, foreground and depth estimation
    ```
-    python preprocess_image.py --path data/demo/ironman/ironman.png
+    python preprocess_image.py --path data/demo/ironman/main.png
    ```

 - Run Magic123 coarse stage without textual inversion, takes ~40 mins
@@ -177,7 +177,7 @@ textual inversion is tedious (requires ~2.5 hours optimization), if you want to
 ### Run ablation studies
 - Run Magic123 with only 2D prior *with* textual inversion (Like RealFusion but we achieve much better performance through training strategies and the coarse-to-fine pipeline)
    ```
-    bash scripts/magic123/run_2dprior.sh 0 default dmtet data/realfusion15/metal_dragon_statue rgba.png 1 1
+    bash scripts/magic123/run_2dprior.sh 0 nerf dmtet data/realfusion15/metal_dragon_statue 1 1
    ```

 - Run Magic123 with only 2D prior *without* textual inversion (Like RealFusion but we achieve much better performance through training strategies and the coarse-to-fine pipeline)
@@ -188,7 +188,7 @@ textual inversion is tedious (requires ~2.5 hours optimization), if you want to

 - Run Magic123 with only 3D prior (Like Zero-1-to-3 but we achieve much better performance through training strategies and the coarse-to-fine pipeline)
    ```
-    bash scripts/magic123/run_3dprior.sh 0 default dmtet data/demo/ironman rgba.png 1 1
+    bash scripts/magic123/run_3dprior.sh 0 nerf dmtet data/demo/ironman 1 1
    ```


@@ -197,7 +197,7 @@ textual inversion is tedious (requires ~2.5 hours optimization), if you want to
 2. Smaller range of time steps for the diffusion noise (t_range). We find *[0.2, 0.6]* gives better performance for image-to-3D tasks.
 3. Using normals as latent in the first 2000 iterations generally (but not always) improves the generated geometry a bit. We turn this on for the Magic123 coarse stage in the script through `--normal_iter_ratio 0.2`
 4. We erode segmentation edges (shrinking the segmentation map by 2 pixels towards the interior) to remove artifacts due to segmentation errors. This is turned on in the fine stage of Magic123 in the script through `--rm_edge`
-5. Other general tricks such as improved texural inversion, advanced diffusion prior (DeepFloyd, SD-XL), stronger 3D prior (Zero123-XL), and larger batch size can be adopted as well but not studied in this work.
+5. Other general tricks such as improved textual inversion, advanced diffusion prior (DeepFloyd, SD-XL), stronger 3D prior (Zero123-XL), and larger batch size can be adopted as well but not studied in this work.
 6. Textual inversion is not really necessary for well-known things (e.g. ironman) or for easily described textures and geometries, since plain text already carries this texture information and will be understood by diffusion models. We use textual inversion by default in all experiments.

 # Acknowledgement
--- a/scripts/magic123/preprocess_folder.sh
+++ b/scripts/magic123/preprocess_folder.sh
@@ -2,5 +2,5 @@ topdir=$1
 imagename=$2 # rgba.png or image.png
 for i in $topdir/*; do
    echo preprocessing "$i"/$imagename ...
-    python scripts/preprocess_image.py "$i"/$imagename
+    python preprocess_image.py "$i"/$imagename
 done
--- a/scripts/magic123/run_2dprior_noinv.sh
+++ b/scripts/magic123/run_2dprior_noinv.sh
@@ -0,0 +1,73 @@
+#! /bin/bash
+#SBATCH -N 1
+#SBATCH --array=0
+#SBATCH -J magic123
+#SBATCH -o slurm_logs/%x.%3a.%A.out
+#SBATCH -e slurm_logs/%x.%3a.%A.err
+#SBATCH --time=3:00:00
+#SBATCH --gres=gpu:v100:1
+#SBATCH --cpus-per-gpu=6
+#SBATCH --mem=30G
+##SBATCH --gpus=1
+
+module load gcc/7.5.0
+
+
+#source ~/.bashrc
+#source activate magic123
+source venv_magic123/bin/activate
+which python 
+
+nvidia-smi
+nvcc --version
+
+hostname
+NUM_GPU_AVAILABLE=`nvidia-smi --query-gpu=name --format=csv,noheader | wc -l`
+echo "number of gpus:" $NUM_GPU_AVAILABLE
+
+RUN_ID=$2  # 2nd argument: run id embedded in the coarse-stage workspace name
+RUN_ID2=$3  # 3rd argument: run id appended to the --dmtet stage workspace name
+DATA_DIR=$4  # 4th argument: directory that holds the input image
+IMAGE_NAME=rgba.png  # input image file name inside DATA_DIR (fixed, not an argument)
+step1=$5  # 5th argument: non-zero runs the first main.py invocation
+step2=$6  # 6th argument: non-zero runs the second (--dmtet) main.py invocation
+FILENAME=$(basename $DATA_DIR)  # last path component of DATA_DIR
+dataset=$(basename $(dirname $DATA_DIR))  # name of DATA_DIR's parent directory
+echo reconstruct $FILENAME under dataset $dataset from folder $DATA_DIR ...
+
+if (( ${step1} )); then
+    CUDA_VISIBLE_DEVICES=$1 python main.py -O \
+        --text "A high-resolution DSLR image of <token>" \
+        --sd_version 1.5 \
+        --image ${DATA_DIR}/${IMAGE_NAME} \
+        --workspace out/magic123-2d/magic123-2d-${RUN_ID}-coarse/$dataset/magic123_2d_${FILENAME}_${RUN_ID}_coarse \
+        --optim adam \
+        --iters 5000 \
+        --guidance SD \
+        --lambda_guidance 1 \
+        --guidance_scale 100 \
+        --latent_iter_ratio 0 \
+        --normal_iter_ratio 0.2 \
+        --t_range 0.2 0.6 \
+        --bg_radius -1 \
+        --save_mesh \
+        ${@:7}
+fi
+
+if (( ${step2} )); then
+    CUDA_VISIBLE_DEVICES=$1 python main.py -O \
+        --text "A high-resolution DSLR image of <token>" \
+        --sd_version 1.5 \
+        --image ${DATA_DIR}/${IMAGE_NAME} \
+        --workspace out/magic123-2d/magic123-2d-${RUN_ID}-${RUN_ID2}/$dataset/magic123_2d_${FILENAME}_${RUN_ID}_${RUN_ID2} \
+        --dmtet --init_ckpt out/magic123-2d/magic123-2d-${RUN_ID}-coarse/$dataset/magic123_2d_${FILENAME}_${RUN_ID}_coarse/checkpoints/magic123_2d_${FILENAME}_${RUN_ID}_coarse.pth \
+        --iters 5000 \
+        --optim adam \
+        --latent_iter_ratio 0 \
+        --guidance SD \
+        --lambda_guidance 1e-3 \
+        --guidance_scale 100 \
+        --rm_edge \
+        --bg_radius -1 \
+        --save_mesh 
+fi
--- a/scripts/magic123/run_folder_both_priors.sh
+++ b/scripts/magic123/run_folder_both_priors.sh
@@ -9,5 +9,5 @@ step2=$7
 for i in $topdir/*; do
    echo "$i"
    [ -d "$i" ] && echo "$i exists."
-    bash ${script_name} $device $runid "$i" $step1 $step2 ${@:8}
+    bash ${script_name} $device $runid $runid2 "$i" $step1 $step2 ${@:8}
 done