Tetap teratur dengan koleksi
Simpan dan kategorikan konten berdasarkan preferensi Anda.
Template WordCount adalah pipeline batch yang membaca teks dari Cloud Storage, memecah setiap baris teks menjadi kata-kata individual (tokenisasi), lalu menghitung frekuensi kemunculan setiap kata. Untuk mengetahui informasi selengkapnya tentang WordCount, lihat Pipeline Contoh WordCount.
Jika bucket Cloud Storage berada di luar perimeter layanan Anda, buat aturan keluar yang mengizinkan akses ke bucket tersebut.
REGION_NAME: region tempat Anda ingin men-deploy tugas Dataflow—misalnya, us-central1
BUCKET_NAME: nama bucket Cloud Storage Anda
API
Untuk menjalankan template menggunakan REST API, kirim permintaan HTTP POST. Untuk mengetahui informasi selengkapnya tentang API dan cakupan otorisasinya, lihat projects.templates.launch.
PROJECT_ID: ID project Google Cloud tempat Anda ingin menjalankan tugas Dataflow
JOB_NAME: nama tugas unik pilihan Anda
LOCATION: region tempat Anda ingin men-deploy tugas Dataflow—misalnya, us-central1
BUCKET_NAME: nama bucket Cloud Storage Anda
Kode sumber template
Java
/* * Copyright (C) 2016 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */packagecom.google.cloud.teleport.templates;importcom.google.cloud.teleport.metadata.Template;importcom.google.cloud.teleport.metadata.TemplateCategory;importcom.google.cloud.teleport.metadata.TemplateParameter;importcom.google.cloud.teleport.templates.WordCount.WordCountOptions;importorg.apache.beam.sdk.Pipeline;importorg.apache.beam.sdk.io.TextIO;importorg.apache.beam.sdk.metrics.Counter;importorg.apache.beam.sdk.metrics.Metrics;importorg.apache.beam.sdk.options.PipelineOptions;importorg.apache.beam.sdk.options.PipelineOptionsFactory;importorg.apache.beam.sdk.options.ValueProvider;importorg.apache.beam.sdk.transforms.Count;importorg.apache.beam.sdk.transforms.DoFn;importorg.apache.beam.sdk.transforms.MapElements;importorg.apache.beam.sdk.transforms.PTransform;importorg.apache.beam.sdk.transforms.ParDo;importorg.apache.beam.sdk.transforms.SimpleFunction;importorg.apache.beam.sdk.values.KV;importorg.apache.beam.sdk.values.PCollection;/** * A template that counts words in text files. * * <p>Check out <a * href="https://github.com/GoogleCloudPlatform/DataflowTemplates/blob/main/v1/README_Word_Count.md">README</a> * for instructions on how to use or modify this template. */@Template(name="Word_Count",category=TemplateCategory.GET_STARTED,displayName="Word Count",description="Batch pipeline. 
Reads text from Cloud Storage, tokenizes text lines into individual words,"+" and performs frequency count on each of the words.",optionsClass=WordCountOptions.class,contactInformation="https://cloud.google.com/support")publicclassWordCount{staticclassExtractWordsFnextendsDoFn<String,String>{privatefinalCounteremptyLines=Metrics.counter(ExtractWordsFn.class,"emptyLines");@ProcessElementpublicvoidprocessElement(ProcessContextc){// Check if the line is empty.if(c.element().trim().isEmpty()){emptyLines.inc();return;}// Split the line into words.String[]words=c.element().split("[^a-zA-Z']+");// Output each word encountered into the output PCollection.for(Stringword:words){if(!word.isEmpty()){c.output(word);}}}}/** A SimpleFunction that converts a Word and Count into a printable string. */publicstaticclassFormatAsTextFnextendsSimpleFunction<KV<String,Long>,String>{@OverridepublicStringapply(KV<String,Long>input){returninput.getKey()+": "+input.getValue();}}/** * A PTransform that converts a PCollection containing lines of text into a PCollection of * formatted word counts. */publicstaticclassCountWordsextendsPTransform<PCollection<String>,PCollection<KV<String,Long>>>{@OverridepublicPCollection<KV<String,Long>>expand(PCollection<String>lines){// Convert lines of text into individual words.PCollection<String>words=lines.apply(ParDo.of(newExtractWordsFn()));// Count the number of times each word occurs.PCollection<KV<String,Long>>wordCounts=words.apply(Count.<String>perElement());returnwordCounts;}}/** * Options supported by {@link com.google.cloud.teleport.templates.WordCount}. * * <p>Inherits standard configuration options. */publicinterfaceWordCountOptionsextendsPipelineOptions{@TemplateParameter.GcsReadFile(order=1,description="Input file(s) in Cloud Storage",helpText="The input file pattern Dataflow reads from. 
Use the example file "+"(gs://dataflow-samples/shakespeare/kinglear.txt) or enter the path to your own "+"using the same format: gs://your-bucket/your-file.txt")ValueProvider<String>getInputFile();voidsetInputFile(ValueProvider<String>value);@TemplateParameter.GcsWriteFolder(order=2,description="Output Cloud Storage file prefix",helpText="Path and filename prefix for writing output files. Ex: gs://your-bucket/counts")ValueProvider<String>getOutput();voidsetOutput(ValueProvider<String>value);}publicstaticvoidmain(String[]args){WordCountOptionsoptions=PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);Pipelinep=Pipeline.create(options);p.apply("ReadLines",TextIO.read().from(options.getInputFile())).apply(newCountWords()).apply(MapElements.via(newFormatAsTextFn())).apply("WriteCounts",TextIO.write().to(options.getOutput()));p.run();}}
[[["Mudah dipahami","easyToUnderstand","thumb-up"],["Memecahkan masalah saya","solvedMyProblem","thumb-up"],["Lainnya","otherUp","thumb-up"]],[["Sulit dipahami","hardToUnderstand","thumb-down"],["Informasi atau kode contoh salah","incorrectInformationOrSampleCode","thumb-down"],["Informasi/contoh yang saya butuhkan tidak ada","missingTheInformationSamplesINeed","thumb-down"],["Masalah terjemahan","translationIssue","thumb-down"],["Lainnya","otherDown","thumb-down"]],["Terakhir diperbarui pada 2025-08-18 UTC."],[[["\u003cp\u003eThe WordCount template is a batch pipeline that analyzes text from Cloud Storage, breaks down lines into words, and counts the frequency of each word.\u003c/p\u003e\n"],["\u003cp\u003eTo run the WordCount template, you must specify the input file path in Cloud Storage and the output file path and prefix, both within Cloud Storage.\u003c/p\u003e\n"],["\u003cp\u003eThe WordCount template can be launched via the Dataflow console, using the \u003ccode\u003egcloud\u003c/code\u003e command-line tool, or through the REST API with a specified job name, region, and parameters.\u003c/p\u003e\n"],["\u003cp\u003eThe WordCount template requires creating an egress rule if your Cloud Storage bucket is outside of your service perimeter.\u003c/p\u003e\n"],["\u003cp\u003eThe WordCount template's core functionality involves extracting words, counting their occurrences, formatting the counts, and writing the results to a specified Cloud Storage location.\u003c/p\u003e\n"]]],[],null,["# Run a sample template\n\nThe WordCount template is a batch pipeline that reads text from Cloud Storage, tokenizes the text lines into individual words, and performs a frequency count on each of the words. 
For more information about WordCount, see [WordCount Example Pipeline](/dataflow/examples/wordcount-example).\n\nIf the Cloud Storage bucket is outside of your [service perimeter](/vpc-service-controls/docs/overview), create an [egress rule](/vpc-service-controls/docs/ingress-egress-rules) that allows access to the bucket.\n\nTemplate parameters\n-------------------\n\nRun the WordCount template\n--------------------------\n\n### Console\n\n1. Go to the Dataflow **Create job from template** page.\n[Go to Create job from template](https://console.cloud.google.com/dataflow/createjob)\n2. In the **Job name** field, enter a unique job name.\n3. Optional: For **Regional endpoint** , select a value from the drop-down menu. The default region is `us-central1`.\n\n\n For a list of regions where you can run a Dataflow job, see\n [Dataflow locations](/dataflow/docs/resources/locations).\n4. From the **Dataflow template** drop-down menu, select the WordCount template.\n5. In the provided parameter fields, enter your parameter values.\n6. Click **Run job**.\n\n### gcloud\n\n\n| **Note:** To use the Google Cloud CLI to run classic templates, you must have [Google Cloud CLI](/sdk/docs/install) version 138.0.0 or later.\n\nIn your shell or terminal, run the template:\n\n\u003cbr /\u003e\n\n gcloud dataflow jobs run JOB_NAME \\\n --gcs-location gs://dataflow-templates/latest/Word_Count \\\n --region REGION_NAME \\\n --parameters \\\n inputFile=gs://dataflow-samples/shakespeare/kinglear.txt,output=gs://BUCKET_NAME/output/my_output\n\n\nReplace the following:\n\n- `JOB_NAME`:\n a unique job name of your choice\n\n- `REGION_NAME`:\n the [region](/dataflow/docs/resources/locations) where you want to\n deploy your Dataflow job---for example, `us-central1`\n\n- `BUCKET_NAME`: the name of your Cloud Storage bucket\n\n\u003cbr /\u003e\n\n### API\n\n\nTo run the template using the REST API, send an HTTP POST request. 
For more information on the\nAPI and its authorization scopes, see\n[`projects.templates.launch`](/dataflow/docs/reference/rest/v1b3/projects.templates/launch).\n\n\u003cbr /\u003e\n\n POST https://dataflow.googleapis.com/v1b3/projects/PROJECT_ID/locations/LOCATION/templates:launch?gcsPath=gs://dataflow-templates/latest/Word_Count\n {\n \"jobName\": \"JOB_NAME\",\n \"parameters\": {\n \"inputFile\" : \"gs://dataflow-samples/shakespeare/kinglear.txt\",\n \"output\": \"gs://BUCKET_NAME/output/my_output\"\n },\n \"environment\": { \"zone\": \"us-central1-f\" }\n }\n\n\nReplace the following:\n\n- `PROJECT_ID`: the Google Cloud project ID where you want to run the Dataflow job\n\n\u003c!-- --\u003e\n\n- `JOB_NAME`:\n a unique job name of your choice\n\n- `LOCATION`:\n the [region](/dataflow/docs/resources/locations) where you want to\n deploy your Dataflow job---for example, `us-central1`\n\n- `BUCKET_NAME`: the name of your Cloud Storage bucket\n\n\u003cbr /\u003e\n\n### Template source code\n\n### Java\n\n /*\n * Copyright (C) 2016 Google LLC\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n * use this file except in compliance with the License. You may obtain a copy of\n * the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the\n * License for the specific language governing permissions and limitations under\n * the License.\n */\n package com.google.cloud.teleport.templates;\n\n import com.google.cloud.teleport.metadata.Template;\n import com.google.cloud.teleport.metadata.TemplateCategory;\n import com.google.cloud.teleport.metadata.TemplateParameter;\n import com.google.cloud.teleport.templates.WordCount.WordCountOptions;\n import org.apache.beam.sdk.Pipeline;\n import org.apache.beam.sdk.io.TextIO;\n import org.apache.beam.sdk.metrics.Counter;\n import org.apache.beam.sdk.metrics.Metrics;\n import org.apache.beam.sdk.options.PipelineOptions;\n import org.apache.beam.sdk.options.PipelineOptionsFactory;\n import org.apache.beam.sdk.options.ValueProvider;\n import org.apache.beam.sdk.transforms.Count;\n import org.apache.beam.sdk.transforms.DoFn;\n import org.apache.beam.sdk.transforms.MapElements;\n import org.apache.beam.sdk.transforms.PTransform;\n import org.apache.beam.sdk.transforms.ParDo;\n import org.apache.beam.sdk.transforms.SimpleFunction;\n import org.apache.beam.sdk.values.KV;\n import org.apache.beam.sdk.values.PCollection;\n\n /**\n * A template that counts words in text files.\n *\n * \u003cp\u003eCheck out \u003ca\n * href=\"https://github.com/GoogleCloudPlatform/DataflowTemplates/blob/main/v1/README_Word_Count.md\"\u003eREADME\u003c/a\u003e\n * for instructions on how to use or modify this template.\n */\n @Template(\n name = \"Word_Count\",\n category = TemplateCategory.GET_STARTED,\n displayName = \"Word Count\",\n description =\n \"Batch pipeline. 
Reads text from Cloud Storage, tokenizes text lines into individual words,\"\n + \" and performs frequency count on each of the words.\",\n optionsClass = WordCountOptions.class,\n contactInformation = \"https://cloud.google.com/support\")\n public class WordCount {\n\n static class ExtractWordsFn extends DoFn\u003cString, String\u003e {\n private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, \"emptyLines\");\n\n @ProcessElement\n public void processElement(ProcessContext c) {\n // Check if the line is empty.\n if (c.element().trim().isEmpty()) {\n emptyLines.inc();\n return;\n }\n\n // Split the line into words.\n String[] words = c.element().split(\"[^a-zA-Z']+\");\n\n // Output each word encountered into the output PCollection.\n for (String word : words) {\n if (!word.isEmpty()) {\n c.output(word);\n }\n }\n }\n }\n\n /** A SimpleFunction that converts a Word and Count into a printable string. */\n public static class FormatAsTextFn extends SimpleFunction\u003cKV\u003cString, Long\u003e, String\u003e {\n @Override\n public String apply(KV\u003cString, Long\u003e input) {\n return input.getKey() + \": \" + input.getValue();\n }\n }\n\n /**\n * A PTransform that converts a PCollection containing lines of text into a PCollection of\n * formatted word counts.\n */\n public static class CountWords\n extends PTransform\u003cPCollection\u003cString\u003e, PCollection\u003cKV\u003cString, Long\u003e\u003e\u003e {\n @Override\n public PCollection\u003cKV\u003cString, Long\u003e\u003e expand(PCollection\u003cString\u003e lines) {\n\n // Convert lines of text into individual words.\n PCollection\u003cString\u003e words = lines.apply(ParDo.of(new ExtractWordsFn()));\n\n // Count the number of times each word occurs.\n PCollection\u003cKV\u003cString, Long\u003e\u003e wordCounts = words.apply(Count.\u003cString\u003eperElement());\n\n return wordCounts;\n }\n }\n\n /**\n * Options supported by {@link com.google.cloud.teleport.templates.WordCount}.\n *\n * 
\u003cp\u003eInherits standard configuration options.\n */\n public interface WordCountOptions extends PipelineOptions {\n\n @TemplateParameter.GcsReadFile(\n order = 1,\n description = \"Input file(s) in Cloud Storage\",\n helpText =\n \"The input file pattern Dataflow reads from. Use the example file \"\n + \"(gs://dataflow-samples/shakespeare/kinglear.txt) or enter the path to your own \"\n + \"using the same format: gs://your-bucket/your-file.txt\")\n ValueProvider\u003cString\u003e getInputFile();\n\n void setInputFile(ValueProvider\u003cString\u003e value);\n\n @TemplateParameter.GcsWriteFolder(\n order = 2,\n description = \"Output Cloud Storage file prefix\",\n helpText = \"Path and filename prefix for writing output files. Ex: gs://your-bucket/counts\")\n ValueProvider\u003cString\u003e getOutput();\n\n void setOutput(ValueProvider\u003cString\u003e value);\n }\n\n public static void main(String[] args) {\n WordCountOptions options =\n PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);\n Pipeline p = Pipeline.create(options);\n p.apply(\"ReadLines\", TextIO.read().from(options.getInputFile()))\n .apply(new CountWords())\n .apply(MapElements.via(new FormatAsTextFn()))\n .apply(\"WriteCounts\", TextIO.write().to(options.getOutput()));\n\n p.run();\n }\n }"]]