import base64
import json
import logging
from pathlib import Path
from encord.objects.classification import Classification
from encord.objects.classification_instance import ClassificationInstance
from encord.objects.ontology_labels_impl import LabelRowV2
from encord.objects.ontology_structure import OntologyStructure
from openai import OpenAI
from openai.types.chat import (
ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam,
ChatCompletionSystemMessageParam,
ChatCompletionUserMessageParam,
)
from openai.types.chat.chat_completion_content_part_image_param import ImageURL
from openai.types.chat.completion_create_params import ResponseFormat
from pydantic import BaseModel, Field, ValidationError


def to_image_completion_content_part(
image_path: Path,
) -> ChatCompletionContentPartImageParam:
"""
Convert an image path into a base64 encoding to be sent to gpt.
"""
with image_path.open("rb") as image_file:
content = base64.b64encode(image_file.read()).decode("utf-8")
return ChatCompletionContentPartImageParam(
image_url=ImageURL(url=f"data:image/jpeg;base64,{content}", detail="auto"),
type="image_url",
)
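

# Optional variant (an assumption, not part of the original script): guess the MIME
# type from the file suffix instead of hard-coding JPEG. The function name is
# illustrative only.
import mimetypes


def to_image_completion_content_part_guessing_type(
    image_path: Path,
) -> ChatCompletionContentPartImageParam:
    """
    Like the helper above, but derives the MIME type from the file suffix.
    """
    mime_type = mimetypes.guess_type(image_path.name)[0] or "image/jpeg"
    with image_path.open("rb") as image_file:
        content = base64.b64encode(image_file.read()).decode("utf-8")
    return ChatCompletionContentPartImageParam(
        image_url=ImageURL(url=f"data:{mime_type};base64,{content}", detail="auto"),
        type="image_url",
    )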


def get_ontology_classification(ontology: OntologyStructure) -> Classification:
    """
    Replace <classification_name> with the name of the classification in your Ontology that you want to update.
    """
    return ontology.get_child_by_title("<classification_name>", type_=Classification)
"""
Below is an example of how to define a pydantic model for extracting text.
GPT also understands if you use list types and enums. For more examples,
have a look at these notebooks:
- [GPT-4o example with videos](http://colab.research.google.com/drive/1ctV-Zpoks7PDEXisVvpP1NeocyBkWXzp?usp=sharing)
- [Gemini 1.5 Pro with advanced pydantic models](http://colab.research.google.com/drive/1jeCCZrumLnCwdVHbn-wK46xUPQQ9KCtf?usp=sharing)
"""
class DescriptionModel(BaseModel):
description: str = Field(
min_length=25,
max_length=1000,
description="A detailed description of the scene",
)
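

# Illustrative sketch only (not used elsewhere in this script): GPT can also fill in
# enums and lists. `SceneType` and `SceneTagsModel` are hypothetical names; adapt the
# fields to the classifications in your own Ontology.
from enum import Enum


class SceneType(str, Enum):
    INDOOR = "indoor"
    OUTDOOR = "outdoor"
    OTHER = "other"


class SceneTagsModel(BaseModel):
    scene_type: SceneType = Field(description="The overall setting of the image")
    tags: list[str] = Field(description="Short keywords describing the image content")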


def describe_scene(label_row: LabelRowV2, asset: Path) -> ClassificationInstance | None:
system_instruction = f"""
You are an image analysis expert. Your task is to extract the most relevant description of the image content provided.
You are expected to only respond in the form of the following JSON schema.
```json
{json.dumps(DescriptionModel.model_json_schema())}
```
Ensure that you do not wrap the object in a list. Only a single object conforming to the JSON schema is allowed.
"""
client = OpenAI()
completion = client.chat.completions.create(
model="gpt-4o",
messages=[
ChatCompletionSystemMessageParam(role="system", content=system_instruction),
            ChatCompletionUserMessageParam(
                role="user",
                content=[
                    to_image_completion_content_part(asset),
                    ChatCompletionContentPartTextParam(
                        type="text",
                        text="Please build a JSON object with respect to this visual data. Follow the JSON schema provided to fill in the schema as accurately as you can.",
                    ),
                ],
            ),
],
response_format={"type": "json_object"},
max_tokens=1000,
)
raw_text = completion.choices[0].message.content
if raw_text is None:
logging.error("No response")
raise ValueError("Missing response from GPT-4o")
    try:
        labels = DescriptionModel.model_validate_json(raw_text)
    except ValidationError:
        # The model returned something that does not match the schema; skip this asset.
        logging.error("Unable to parse GPT response as a DescriptionModel: %s", raw_text)
        return None
ontology_classification = get_ontology_classification(label_row.ontology_structure)
instance = ontology_classification.create_instance()
instance.set_answer(labels.description)
return instance
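

# Minimal usage sketch (assumptions, not part of the original flow): `<project_hash>`
# and the key/image paths are placeholders you must replace. It assumes the Ontology
# attached to the project contains the text classification referenced above.
if __name__ == "__main__":
    from encord import EncordUserClient

    user_client = EncordUserClient.create_with_ssh_private_key(
        Path("/path/to/ssh-private-key").read_text()
    )
    project = user_client.get_project("<project_hash>")

    # Take the first label row as an example and load its labels.
    label_row = project.list_label_rows_v2()[0]
    label_row.initialise_labels()

    instance = describe_scene(label_row, Path("/path/to/image.jpg"))
    if instance is not None:
        label_row.add_classification_instance(instance)
        label_row.save()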