#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 20 13:41:58 2025
@author: kangchaewon
"""
import os, sys
sys.path.append(os.path.join(os.getcwd(), "./content/GSA/GroundingDINO"))
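# Note: appending the vendored GroundingDINO directory to sys.path lets the
# repo's internal top-level imports (groundingdino.*) resolve, while the
# content.GSA.* imports below load the same package relative to the cwd.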
from PIL import Image
# Grounding DINO
import content.GSA.GroundingDINO.groundingdino.datasets.transforms as T
from content.GSA.GroundingDINO.groundingdino.models import build_model
from content.GSA.GroundingDINO.groundingdino.util import box_ops
from content.GSA.GroundingDINO.groundingdino.util.slconfig import SLConfig
from content.GSA.GroundingDINO.groundingdino.util.utils import clean_state_dict
from content.GSA.GroundingDINO.groundingdino.util.inference import annotate, load_image, predict
# segment anything
from segment_anything import build_sam, SamPredictor
import numpy as np
# diffusers
import requests
import torch
from io import BytesIO
from diffusers import StableDiffusionInpaintPipeline
from huggingface_hub import hf_hub_download
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
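# Note (my addition, not from the original post): on an Apple-silicon MacBook
# (the troubleshooting setup described below), PyTorch's MPS backend may be
# available as a middle ground between CUDA and CPU, e.g.:
#   device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
# Whether GroundingDINO and SAM run cleanly on MPS is not guaranteed, so treat
# this as a sketch.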
import content.GSA.GroundingDINO.groundingdino as dino
def load_model_hf(repo_id, filename, ckpt_config_filename, device='cpu'):
    cache_config_file = hf_hub_download(repo_id=repo_id, filename=ckpt_config_filename)
    args = SLConfig.fromfile(cache_config_file)
    args.device = device
    # ✅ Defensive check: fall back to the default text encoder if the config value is malformed.
    if not isinstance(args.text_encoder_type, str):
        print("⚠️ text_encoder_type is invalid; falling back to the default.")
        args.text_encoder_type = "bert-base-uncased"
    model = build_model(args)
    cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
    checkpoint = torch.load(cache_file, map_location=device)
    log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
    print("Model loaded from {} \n => {}".format(cache_file, log))
    _ = model.eval()
    return model
ckpt_repo_id = "ShilongLiu/GroundingDINO"
ckpt_filename = "groundingdino_swinb_cogcoor.pth"
ckpt_config_filename = "GroundingDINO_SwinB.cfg.py"
groundingdino_model = load_model_hf(ckpt_repo_id, ckpt_filename, ckpt_config_filename, device)
sam_checkpoint = 'sam_vit_h_4b8939.pth'
sam_predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint).to(device))
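# sam_vit_h_4b8939.pth must already exist in the working directory; see the
# troubleshooting section at the end for a download snippet.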
sd_pipe = StableDiffusionInpaintPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-inpainting",
torch_dtype=torch.float16,
).to(device)
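# Note (assumption, not in the original post): torch.float16 is only reliable
# on CUDA; on the CPU/MacBook setup from the troubleshooting section the
# half-precision pipeline fails at runtime. A minimal device-aware sketch:
#   sd_pipe = StableDiffusionInpaintPipeline.from_pretrained(
#       "stabilityai/stable-diffusion-2-inpainting",
#       torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
#   ).to(device)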
# Load image
def download_image(url, image_file_path):
    r = requests.get(url, timeout=4.0)
    r.raise_for_status()  # fail fast on a bad HTTP status code
    with Image.open(BytesIO(r.content)) as im:
        im.save(image_file_path)
    print('Image downloaded from url: {} and saved to: {}.'.format(url, image_file_path))
local_image_path = "./image-3.jpg"
image_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
download_image(image_url, local_image_path)
image_source, image = load_image(local_image_path)
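# load_image returns the raw image as an RGB numpy array (image_source) plus
# the normalized tensor that GroundingDINO's predict expects (image).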
Image.fromarray(image_source)  # displays the source image in a notebook; a no-op when run as a script
# Use Grounding DINO to find the bench.
def detect(image, text_prompt, model, box_threshold=0.3, text_threshold=0.25):
    boxes, logits, phrases = predict(
        model=model,
        image=image,
        caption=text_prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold
    )
    # annotate draws on the module-level image_source loaded above
    annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
    annotated_frame = annotated_frame[..., ::-1]  # BGR to RGB
    return annotated_frame, boxes
annotated_frame, detected_boxes = detect(image, text_prompt="bench", model=groundingdino_model)
ret_box_image = Image.fromarray(annotated_frame)
ret_box_image.save("./ret_box_image.jpg")
def segment(image, sam_model, boxes):
    sam_model.set_image(image)
    H, W, _ = image.shape
    # Grounding DINO boxes are normalized cxcywh; convert to absolute xyxy pixels.
    boxes_xyxy = box_ops.box_cxcywh_to_xyxy(boxes) * torch.Tensor([W, H, W, H])
    transformed_boxes = sam_model.transform.apply_boxes_torch(boxes_xyxy.to(device), image.shape[:2])
    masks, _, _ = sam_model.predict_torch(
        point_coords=None,
        point_labels=None,
        boxes=transformed_boxes,
        multimask_output=False,
    )
    return masks.cpu()
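# With multimask_output=False, predict_torch returns one mask per box as a
# (num_boxes, 1, H, W) boolean tensor, so segmented_frame_masks[i][0] below is
# the (H, W) mask for box i.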
def draw_mask(mask, image, random_color=True):
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.8])], axis=0)
    else:
        color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    annotated_frame_pil = Image.fromarray(image).convert("RGBA")
    mask_image_pil = Image.fromarray((mask_image.cpu().numpy() * 255).astype(np.uint8)).convert("RGBA")
    return np.array(Image.alpha_composite(annotated_frame_pil, mask_image_pil))
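# draw_mask composites a translucent colored layer (alpha 0.8 random color, or
# 0.6 blue) over the frame using PIL's RGBA alpha compositing.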
segmented_frame_masks = segment(image_source, sam_predictor, boxes=detected_boxes)
annotated_frame_with_mask = draw_mask(segmented_frame_masks[0][0], annotated_frame)
ret_box_mask_checking_image = Image.fromarray(annotated_frame_with_mask)
ret_box_mask_checking_image = ret_box_mask_checking_image.convert("RGB")
ret_box_mask_checking_image.save("./ret_box_mask_checking_image.jpg")
mask = segmented_frame_masks[0][0].cpu().numpy()
inverted_mask = ((1 - mask) * 255).astype(np.uint8)
image_source_pil = Image.fromarray(image_source)
image_mask_pil = Image.fromarray(mask)
inverted_image_mask_pil = Image.fromarray(inverted_mask)
image_source_pil.save("image_source.jpg", format="JPEG")
image_mask_pil.save("image_mask.jpg", format="JPEG")
inverted_image_mask_pil.save("inverted_image_mask.jpg", format="JPEG")
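# image_mask.jpg covers the detected bench (used below to replace the object),
# while inverted_image_mask.jpg covers everything else (used to replace the
# background).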
def generate_image(image, mask, prompt, negative_prompt, pipe, seed):
    # Resize to 512x512 for inpainting (the model's training resolution),
    # then restore the original size afterwards.
    w, h = image.size
    in_image = image.resize((512, 512))
    in_mask = mask.resize((512, 512))
    generator = torch.Generator(device).manual_seed(seed)
    result = pipe(image=in_image, mask_image=in_mask, prompt=prompt, negative_prompt=negative_prompt, generator=generator)
    result = result.images[0]
    return result.resize((w, h))
# Turn the masked bench into a sofa.
prompt="A sofa, high quality, detailed, cyberpunk, futuristic, with a lot of details, and a lot of colors."
negative_prompt="low resolution, ugly"
seed = 32 # for reproducibility
generated_image = generate_image(image=image_source_pil, mask=image_mask_pil, prompt=prompt, negative_prompt=negative_prompt, pipe=sd_pipe, seed=seed)
generated_image.save("generated_image.jpg", format="JPEG")
# Now the reverse: change the background to a beach.
prompt="a beach with turquoise water, sand, and coconuts"
negative_prompt="people, low resolution, ugly"
seed = 32 # for reproducibility
generated_image2 = generate_image(image_source_pil, inverted_image_mask_pil, prompt, negative_prompt, sd_pipe, seed)
generated_image2.save("generated_image2.jpg", format="JPEG")
Troubleshooting
torchvision would not install on my MacBook.
I created a conda virtual environment and ran everything inside it.
2025.05.20 - [ICT 드림업 - 무물 매니저/개발] - Conda 가상환경 만들기 (kcw9609.tistory.com):
    conda search python                       # list available Python versions
    conda create -n <env-name> python=3.12.2  # <env-name> is a placeholder; the original snippet omitted it
    conda install pytorch::pytorch torchvision torchaudio -c pytorch
    source activate <env-name>
---
The SAM checkpoint file was empty, which caused an error.
import requests
url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
with requests.get(url, stream=True) as r:
    r.raise_for_status()
    with open("sam_vit_h_4b8939.pth", "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
I downloaded the file directly with the code above.
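Since the root cause was a truncated download, a quick size check before loading the checkpoint catches the problem early. This is my addition, not from the original post; the 1 GB threshold is a loose lower bound (the published sam_vit_h_4b8939.pth is roughly 2.4 GB):
import os
size = os.path.getsize("sam_vit_h_4b8939.pth")
print(f"checkpoint size: {size / 1e9:.2f} GB")
assert size > 1_000_000_000, "checkpoint looks empty or truncated; re-download it"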
References
03. Grounding DINO 실습 (wikidocs.net)
GroundingDINO 설치 썰! (https://ongamedev.tistory.com/531)
Grounded-Segment-Anything/GroundingDINO (https://github.com/IDEA-Research/Grounded-Segment-Anything/tree/main/GroundingDINO)