150 changes: 150 additions & 0 deletions genai/code_execution/codeexecution_annotateimage_with_txt_gcsimg.py
@@ -0,0 +1,150 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def generate_content() -> bool:
# [START googlegenaisdk_codeexecution_annotateimage_with_txt_gcsimg]
import io
from PIL import Image
from google import genai
from google.genai import types

client = genai.Client()

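# Enable the code execution tool so the model can write and run Python
# (for example, with PIL) to draw the requested annotations on the image.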
response = client.models.generate_content(
model="gemini-3-flash-preview",
contents=[
types.Part.from_uri(
file_uri="https://storage.googleapis.com/cloud-samples-data/generative-ai/image/robotic.jpeg",
mime_type="image/png",
),
"Annotate on the image with arrows of different colors, which object should go into which bin.",
],
config=types.GenerateContentConfig(tools=[types.Tool(code_execution=types.ToolCodeExecution)]),
)

img_count = 0
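# The response parts interleave the model's text, the Python code it
# generated, the execution output, and any images the code produced.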
for part in response.candidates[0].content.parts:
if part.text is not None:
print(part.text)
if part.executable_code is not None:
print("####################### 1. Generate Python Code #######################")
print(part.executable_code.code)
if part.code_execution_result is not None:
print("####################### 2. Executing Python Code #######################")
print(part.code_execution_result.output)
# Save any images generated by the executed code to local files
if part.as_image() is not None:
print("####################### 3. Save Output #######################")
img_count += 1
output_location = f"sample_images/output-annotate-image-{img_count}.jpg"
image_data = part.as_image().image_bytes
image = Image.open(io.BytesIO(image_data))
image = image.convert("RGB")
image.save(output_location)
print(f"Output is saved to {output_location}")
# Example response:
# ####################### 1. Generate Python Code #######################
# import PIL.Image
# import PIL.ImageDraw
#
# # Load the image to get dimensions
# img = PIL.Image.open('f_https___storage.googleapis.com_cloud_samples_data_generative_ai_image_robotic.jpeg')
# width, height = img.size
#
# # Define objects and bins with normalized coordinates [ymin, xmin, ymax, xmax]
# bins = {
# 'light_blue': [118, 308, 338, 436],
# 'green': [248, 678, 458, 831],
# 'black': [645, 407, 898, 578]
# }
#
# objects = [
# {'name': 'green pepper', 'box': [256, 482, 296, 546], 'target': 'green'},
# {'name': 'red pepper', 'box': [317, 478, 349, 544], 'target': 'green'},
# {'name': 'grapes', 'box': [584, 555, 664, 593], 'target': 'green'},
# {'name': 'cherries', 'box': [463, 671, 511, 718], 'target': 'green'},
# {'name': 'soda can', 'box': [397, 524, 489, 605], 'target': 'light_blue'},
# {'name': 'brown snack', 'box': [397, 422, 475, 503], 'target': 'black'},
# {'name': 'welch snack', 'box': [520, 466, 600, 543], 'target': 'black'},
# {'name': 'paper towel', 'box': [179, 564, 250, 607], 'target': 'black'},
# {'name': 'plastic cup', 'box': [271, 587, 346, 643], 'target': 'black'},
# ]
#
# # Helper to get center of a normalized box
# def get_center(box):
# ymin, xmin, ymax, xmax = box
# return ((xmin + xmax) / 2000 * width, (ymin + ymax) / 2000 * height)
#
# draw = PIL.ImageDraw.Draw(img)
#
# # Define arrow colors based on target bin
# colors = {
# 'green': 'green',
# 'light_blue': 'blue',
# 'black': 'red'
# }
#
# for obj in objects:
# start_point = get_center(obj['box'])
# end_point = get_center(bins[obj['target']])
# color = colors[obj['target']]
# # Drawing a line with an arrow head (simulated with a few extra lines)
# draw.line([start_point, end_point], fill=color, width=5)
# # Simple arrowhead
# import math
# angle = math.atan2(end_point[1] - start_point[1], end_point[0] - start_point[0])
# arrow_len = 20
# p1 = (end_point[0] - arrow_len * math.cos(angle - math.pi / 6),
# end_point[1] - arrow_len * math.sin(angle - math.pi / 6))
# p2 = (end_point[0] - arrow_len * math.cos(angle + math.pi / 6),
# end_point[1] - arrow_len * math.sin(angle + math.pi / 6))
# draw.line([end_point, p1], fill=color, width=5)
# draw.line([end_point, p2], fill=color, width=5)
#
# img.save('annotated_robotic.jpeg')
#
# # Also list detections for confirmation
# # [
# # {"box_2d": [118, 308, 338, 436], "label": "light blue bin"},
# # {"box_2d": [248, 678, 458, 831], "label": "green bin"},
# # {"box_2d": [645, 407, 898, 578], "label": "black bin"},
# # {"box_2d": [256, 482, 296, 546], "label": "green pepper"},
# # {"box_2d": [317, 478, 349, 544], "label": "red pepper"},
# # {"box_2d": [584, 555, 664, 593], "label": "grapes"},
# # {"box_2d": [463, 671, 511, 718], "label": "cherries"},
# # {"box_2d": [397, 524, 489, 605], "label": "soda can"},
# # {"box_2d": [397, 422, 475, 503], "label": "brown snack"},
# # {"box_2d": [520, 466, 600, 543], "label": "welch snack"},
# # {"box_2d": [179, 564, 250, 607], "label": "paper towel"},
# # {"box_2d": [271, 587, 346, 643], "label": "plastic cup"}
# # ]
#
# ####################### 2. Executing Python Code #######################
# None
# ####################### 3. Save Output #######################
# Output is saved to sample_images/output-annotate-image-1.jpg
# The image has been annotated with arrows indicating the appropriate bin for each object based on standard waste sorting practices:
#
# - **Green Arrows (Compost):** Organic items such as the green pepper, red pepper, grapes, and cherries are directed to the **green bin**.
# - **Blue Arrow (Recycling):** The crushed soda can is directed to the **light blue bin**.
# - **Red Arrows (Trash/Landfill):** Non-recyclable or contaminated items like the snack wrappers (brown and Welch's), the white paper towel, and the small plastic cup are directed to the **black bin**.
#
# These categorizations follow common sorting rules where green is for organics, blue for recyclables, and black for general waste.
# [END googlegenaisdk_codeexecution_annotateimage_with_txt_gcsimg]
return True


if __name__ == "__main__":
generate_content()
158 changes: 158 additions & 0 deletions genai/code_execution/codeexecution_barplot_with_txt_img.py
@@ -0,0 +1,158 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def generate_content() -> bool:
# [START googlegenaisdk_codeexecution_barplot_with_txt_img]
import io
from PIL import Image
from google import genai
from google.genai import types

# Read a local image as input
image_pil = Image.open("sample_images/tabular_data.png")
image_pil = image_pil.convert("RGB")
byte_io = io.BytesIO()
image_pil.save(byte_io, format="JPEG")
image_bytes = byte_io.getvalue()
image = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")

client = genai.Client()

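# Enable the code execution tool so the model can extract the table,
# normalize the scores, and render the chart with matplotlib.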
response = client.models.generate_content(
model="gemini-3-flash-preview",
contents=[
image,
"Make a bar chart of per-category performance, normalize prior SOTA as 1.0 for each task,"
"then take average per-category. Plot using matplotlib with nice style.",
],
config=types.GenerateContentConfig(tools=[types.Tool(code_execution=types.ToolCodeExecution)]),
)

img_count = 0
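# The response parts interleave text, the generated Python code, its
# execution output, and the rendered chart images.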
for part in response.candidates[0].content.parts:
if part.text is not None:
print(part.text)
if part.executable_code is not None:
print("####################### 1. Generate Python Code #######################")
print(part.executable_code.code)
if part.code_execution_result is not None:
print("####################### 2. Executing Python Code #######################")
print(part.code_execution_result.output)
# Save any images generated by the executed code to local files
if part.as_image() is not None:
print("####################### 3. Save Output #######################")
img_count += 1
output_location = f"sample_images/output-barplot-{img_count}.jpg"
image_data = part.as_image().image_bytes
image = Image.open(io.BytesIO(image_data))
image = image.convert("RGB")
image.save(output_location)
print(f"Output is saved to {output_location}")
# Example response:
# ####################### 1. Generate Python Code #######################
# import matplotlib.pyplot as plt
# import numpy as np
#
# data = [
# # Category, Benchmark, G3P, G2.5P, C4.5, GPT5.1, lower_is_better
# ("Visual Reasoning", "MMMU Pro", 81.0, 68.0, 72.0, 76.0, False),
# ("Visual Reasoning", "VLMsAreBiased", 50.6, 24.3, 32.7, 21.7, False),
# ("Document", "CharXiv Reasoning", 81.4, 69.6, 67.2, 69.5, False),
# ("Document", "OmniDocBench1.5*", 0.115, 0.145, 0.120, 0.147, True),
# ("Spatial", "ERQA", 70.5, 56.0, 51.3, 60.0, False),
# ("Spatial", "Point-Bench", 85.5, 62.7, 38.5, 41.8, False),
# ("Spatial", "RefSpatial", 65.5, 33.6, 19.5, 28.2, False),
# ("Spatial", "CV-Bench", 92.0, 85.9, 83.8, 84.6, False),
# ("Spatial", "MindCube", 77.7, 57.5, 58.5, 61.7, False),
# ("Screen", "ScreenSpot Pro", 72.7, 11.4, 49.9, 3.50, False),
# ("Screen", "Gui-World QA", 68.0, 42.8, 44.9, 38.7, False),
# ("Video", "Video-MMMU", 87.6, 83.6, 84.4, 80.4, False),
# ("Video", "Video-MME", 88.4, 86.9, 84.1, 86.3, False),
# ("Video", "1H-VideoQA", 81.8, 79.4, 52.0, 61.5, False),
# ("Video", "Perception Test", 80.0, 78.4, 74.1, 77.8, False),
# ("Video", "YouCook2", 222.7, 188.3, 145.8, 132.4, False),
# ("Video", "Vatex", 77.4, 71.3, 60.1, 62.9, False),
# ("Video", "Motion Bench", 70.3, 66.3, 65.9, 61.1, False),
# ("Education", "Math Kangaroo", 84.4, 77.4, 68.9, 79.9, False),
# ("Biomedical", "MedXpertQA-MM", 77.8, 65.9, 62.2, 65.5, False),
# ("Biomedical", "VQA-RAD", 81.9, 71.4, 76.0, 72.2, False),
# ("Biomedical", "MicroVQA", 68.8, 63.5, 61.4, 61.5, False),
# ]
#
# normalized_scores = []
# for cat, bench, g3p, g25p, c45, gpt, lib in data:
# others = [g25p, c45, gpt]
# if lib:
# sota = min(others)
# norm_score = sota / g3p
# else:
# sota = max(others)
# norm_score = g3p / sota
# normalized_scores.append((cat, norm_score))
#
# categories = {}
# for cat, score in normalized_scores:
# if cat not in categories:
# categories[cat] = []
# categories[cat].append(score)
#
# avg_per_category = {cat: np.mean(scores) for cat, scores in categories.items()}
#
# # Plotting
# cats = list(avg_per_category.keys())
# values = [avg_per_category[c] for c in cats]
#
# # Sort categories for better visualization if needed, or keep order from data
# plt.figure(figsize=(10, 6))
# plt.style.use('ggplot')
# bars = plt.bar(cats, values, color='skyblue', edgecolor='navy')
#
# plt.axhline(y=1.0, color='red', linestyle='--', label='Prior SOTA (1.0)')
# plt.ylabel('Normalized Performance (SOTA = 1.0)')
# plt.title('Gemini 3 Pro Performance relative to Prior SOTA (Normalized)', fontsize=14)
# plt.xticks(rotation=45, ha='right')
# plt.ylim(0, max(values) * 1.2)
#
# for bar in bars:
# yval = bar.get_height()
# plt.text(bar.get_x() + bar.get_width()/2, yval + 0.02, f'{yval:.2f}x', ha='center', va='bottom')
#
# plt.legend()
# plt.tight_layout()
# plt.savefig('performance_chart.png')
# plt.show()
#
# print(avg_per_category)
#
# ####################### 2. Executing Python Code #######################
# {'Visual Reasoning': np.float64(1.3065950426525028), 'Document': np.float64(1.1065092453773113), 'Spatial': np.float64(1.3636746436001959), 'Screen': np.float64(1.4856952211773211), 'Video': np.float64(1.0620548283943443), 'Education': np.float64(1.0563204005006257), 'Biomedical': np.float64(1.1138909257119955)}
#
# ####################### 3. Save Output #######################
# Output is saved to sample_images/output-barplot-1.jpg
# ####################### 3. Save Output #######################
# Output is saved to sample_images/output-barplot-2.jpg
# Based on the data provided in the table, I have calculated the per-category performance of Gemini 3 Pro normalized against the prior state-of-the-art (SOTA), which is defined as the best performance among Gemini 2.5 Pro, Claude Opus 4.5, and GPT-5.1 for each benchmark.
#
# For benchmarks where lower values are better (indicated by an asterisk, e.g., OmniDocBench1.5*), the normalization was calculated as $\text{Prior SOTA} / \text{Gemini 3 Pro Score}$. For all other benchmarks, it was calculated as $\text{Gemini 3 Pro Score} / \text{Prior SOTA}$. The values were then averaged within each category.
#
# The resulting bar chart below shows that Gemini 3 Pro outperforms the prior SOTA across all categories, with the most significant gains in **Screen** (1.49x), **Spatial** (1.36x), and **Visual Reasoning** (1.31x) benchmarks.
#
# ![Gemini 3 Pro Performance Chart](performance_chart.png)
# [END googlegenaisdk_codeexecution_barplot_with_txt_img]
return True


if __name__ == "__main__":
generate_content()