Generate text from multimodal prompt

This sample demonstrates how to generate text from a multimodal prompt using the Gemini model. The prompt consists of three images and two text prompts. The model generates a text response that describes the images and the text prompts.

Explore further

For detailed documentation that includes this code sample, see the following:

Code sample

Go

Before trying this sample, follow the Go setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Go API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

import (
	"context"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"strings"

	"cloud.google.com/go/vertexai/genai"
)

func main() {
	projectID := os.Getenv("GOOGLE_CLOUD_PROJECT")
	location := "us-central1"
	modelName := "gemini-1.5-flash-001"
	temperature := 0.4

	if projectID == "" {
		log.Fatal("require environment variable GOOGLE_CLOUD_PROJECT")
	}

	// construct this multimodal prompt:
	// [image of colosseum] city: Rome, Landmark: the Colosseum
	// [image of forbidden city]  city: Beijing, Landmark: the Forbidden City
	// [new image]

	// create prompt image parts
	// colosseum
	colosseum, err := partFromImageURL("https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark1.png")
	if err != nil {
		log.Fatalf("unable to read image: %v", err)
	}
	// forbidden city
	forbiddenCity, err := partFromImageURL("https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark2.png")
	if err != nil {
		log.Fatalf("unable to read image: %v", err)
	}
	// new image
	newImage, err := partFromImageURL("https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark3.png")
	if err != nil {
		log.Fatalf("unable to read image: %v", err)
	}

	// create a multimodal (multipart) prompt
	prompt := []genai.Part{
		colosseum,
		genai.Text("city: Rome, Landmark: the Colosseum "),
		forbiddenCity,
		genai.Text("city: Beijing, Landmark: the Forbidden City "),
		newImage,
	}

	// generate the response
	err = generateMultimodalContent(os.Stdout, prompt, projectID, location, modelName, float32(temperature))
	if err != nil {
		log.Fatalf("unable to generate: %v", err)
	}
}

// generateMultimodalContent provide a generated response using multimodal input
func generateMultimodalContent(w io.Writer, parts []genai.Part, projectID, location, modelName string, temperature float32) error {
	ctx := context.Background()

	client, err := genai.NewClient(ctx, projectID, location)
	if err != nil {
		log.Fatal(err)
	}
	defer client.Close()

	model := client.GenerativeModel(modelName)
	model.SetTemperature(temperature)

	res, err := model.GenerateContent(ctx, parts...)
	if err != nil {
		return fmt.Errorf("unable to generate contents: %v", err)
	}

	fmt.Fprintf(w, "generated response: %s\n", res.Candidates[0].Content.Parts[0])

	return nil
}

// partFromImageURL create a multimodal prompt part from an image URL
func partFromImageURL(image string) (genai.Part, error) {
	var img genai.Blob

	imageURL, err := url.Parse(image)
	if err != nil {
		return img, err
	}
	res, err := http.Get(image)
	if err != nil || res.StatusCode != 200 {
		return img, err
	}
	defer res.Body.Close()
	data, err := io.ReadAll(res.Body)
	if err != nil {
		return img, fmt.Errorf("unable to read from http: %v", err)
	}

	position := strings.LastIndex(imageURL.Path, ".")
	if position == -1 {
		return img, fmt.Errorf("couldn't find a period to indicate a file extension")
	}
	ext := imageURL.Path[position+1:]

	img = genai.ImageData(ext, data)
	return img, nil
}

Python

Before trying this sample, follow the Python setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Python API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

import vertexai

from vertexai.generative_models import GenerativeModel, Part

# TODO(developer): Update and un-comment below line
# project_id = "PROJECT_ID"

vertexai.init(project=project_id, location="us-central1")

# Load images from Cloud Storage URI
image_file1 = Part.from_uri(
    "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
    mime_type="image/png",
)
image_file2 = Part.from_uri(
    "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark2.png",
    mime_type="image/png",
)
image_file3 = Part.from_uri(
    "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark3.png",
    mime_type="image/png",
)

model = GenerativeModel(model_name="gemini-1.5-flash-001")
response = model.generate_content(
    [
        image_file1,
        "city: Rome, Landmark: the Colosseum",
        image_file2,
        "city: Beijing, Landmark: Forbidden City",
        image_file3,
    ]
)
print(response.text)

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.