/*
 * Copyright (C) 2016 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.teleport.templates;

import com.google.cloud.teleport.metadata.Template;
import com.google.cloud.teleport.metadata.TemplateCategory;
import com.google.cloud.teleport.metadata.TemplateParameter;
import com.google.cloud.teleport.templates.WordCount.WordCountOptions;
import java.util.regex.Pattern;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.metrics.Counter;
import org.apache.beam.sdk.metrics.Metrics;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

/**
 * A template that counts words in text files.
 *
 * <p>Check out <a
 * href="https://github.com/GoogleCloudPlatform/DataflowTemplates/blob/main/v1/README_Word_Count.md">README</a>
 * for instructions on how to use or modify this template.
 */
@Template(
    name = "Word_Count",
    category = TemplateCategory.GET_STARTED,
    displayName = "Word Count",
    description =
        "Batch pipeline. Reads text from Cloud Storage, tokenizes text lines into individual words,"
            + " and performs frequency count on each of the words.",
    optionsClass = WordCountOptions.class,
    contactInformation = "https://cloud.google.com/support")
public class WordCount {

  /**
   * A DoFn that splits each input line into words and emits every non-empty word.
   *
   * <p>Lines that are empty after trimming are not tokenized; they only increment the {@code
   * emptyLines} counter.
   */
  static class ExtractWordsFn extends DoFn<String, String> {

    /**
     * Matches one or more consecutive characters that are neither ASCII letters nor apostrophes.
     * Compiled once here instead of on every element; static, so it is not part of the serialized
     * DoFn instance. Note that any other character, including non-ASCII letters, acts as a
     * separator.
     */
    private static final Pattern WORD_SEPARATOR = Pattern.compile("[^a-zA-Z']+");

    private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines");

    /**
     * Tokenizes the current element and outputs each word it contains.
     *
     * @param c the process context supplying the input line and receiving the extracted words
     */
    @ProcessElement
    public void processElement(ProcessContext c) {
      String line = c.element();

      // Blank lines carry no words: record them in the metric and stop.
      if (line.trim().isEmpty()) {
        emptyLines.inc();
        return;
      }

      // Splitting can yield an empty leading token when the line starts with a separator,
      // so empty tokens are filtered out before being emitted.
      for (String word : WORD_SEPARATOR.split(line)) {
        if (!word.isEmpty()) {
          c.output(word);
        }
      }
    }
  }

  /** A SimpleFunction that converts a word and its count into a printable string. */
  public static class FormatAsTextFn extends SimpleFunction<KV<String, Long>, String> {

    /**
     * Renders a word/count pair as {@code "<word>: <count>"}.
     *
     * @param input the word (key) and its number of occurrences (value)
     * @return the key and value joined by {@code ": "}
     */
    @Override
    public String apply(KV<String, Long> input) {
      return input.getKey() + ": " + input.getValue();
    }
  }

  /**
   * A PTransform that converts a PCollection containing lines of text into a PCollection of
   * word/count pairs.
   *
   * <p>The result is not formatted as text; apply {@link FormatAsTextFn} to obtain printable
   * strings.
   */
  public static class CountWords
      extends PTransform<PCollection<String>, PCollection<KV<String, Long>>> {

    /**
     * Extracts words from the given lines and counts the occurrences of each distinct word.
     *
     * @param lines the lines of text to analyze
     * @return one {@code KV} per distinct word, holding the word and its occurrence count
     */
    @Override
    public PCollection<KV<String, Long>> expand(PCollection<String> lines) {
      // Convert lines of text into individual words.
      PCollection<String> words = lines.apply(ParDo.of(new ExtractWordsFn()));

      // Count the number of times each word occurs.
      return words.apply(Count.<String>perElement());
    }
  }

  /**
   * Options supported by {@link com.google.cloud.teleport.templates.WordCount}.
   *
   * <p>Inherits standard configuration options.
   */
  public interface WordCountOptions extends PipelineOptions {

    @TemplateParameter.GcsReadFile(
        order = 1,
        description = "Input file(s) in Cloud Storage",
        helpText =
            "The input file pattern Dataflow reads from. Use the example file "
                + "(gs://dataflow-samples/shakespeare/kinglear.txt) or enter the path to your own "
                + "using the same format: gs://your-bucket/your-file.txt")
    ValueProvider<String> getInputFile();

    void setInputFile(ValueProvider<String> value);

    @TemplateParameter.GcsWriteFolder(
        order = 2,
        description = "Output Cloud Storage file prefix",
        helpText = "Path and filename prefix for writing output files. Ex: gs://your-bucket/counts")
    ValueProvider<String> getOutput();

    void setOutput(ValueProvider<String> value);
  }

  /**
   * Builds the word-count pipeline from the command-line options and runs it.
   *
   * <p>The pipeline reads lines from the configured input file, counts words, formats each count
   * as text, and writes the results under the configured output prefix.
   *
   * @param args command-line arguments parsed into {@link WordCountOptions}
   */
  public static void main(String[] args) {
    WordCountOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);

    Pipeline p = Pipeline.create(options);
    p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
        .apply(new CountWords())
        .apply(MapElements.via(new FormatAsTextFn()))
        .apply("WriteCounts", TextIO.write().to(options.getOutput()));

    p.run();
  }
}
[[["易于理解","easyToUnderstand","thumb-up"],["解决了我的问题","solvedMyProblem","thumb-up"],["其他","otherUp","thumb-up"]],[["很难理解","hardToUnderstand","thumb-down"],["信息或示例代码不正确","incorrectInformationOrSampleCode","thumb-down"],["没有我需要的信息/示例","missingTheInformationSamplesINeed","thumb-down"],["翻译问题","translationIssue","thumb-down"],["其他","otherDown","thumb-down"]],["最后更新时间 (UTC):2025-08-18。"],[[["\u003cp\u003eThe WordCount template is a batch pipeline that analyzes text from Cloud Storage, breaks down lines into words, and counts the frequency of each word.\u003c/p\u003e\n"],["\u003cp\u003eTo run the WordCount template, you must specify the input file path in Cloud Storage and the output file path and prefix, both within Cloud Storage.\u003c/p\u003e\n"],["\u003cp\u003eThe WordCount template can be launched via the Dataflow console, using the \u003ccode\u003egcloud\u003c/code\u003e command-line tool, or through the REST API with a specified job name, region, and parameters.\u003c/p\u003e\n"],["\u003cp\u003eThe WordCount template requires creating an egress rule if your Cloud Storage bucket is outside of your service perimeter.\u003c/p\u003e\n"],["\u003cp\u003eThe WordCount template's core functionality involves extracting words, counting their occurrences, formatting the counts, and writing the results to a specified Cloud Storage location.\u003c/p\u003e\n"]]],[],null,["# Run a sample template\n\nThe WordCount template is a batch pipeline that reads text from Cloud Storage, tokenizes the text lines into individual words, and performs a frequency count on each of the words. 
For more information about WordCount, see [WordCount Example Pipeline](/dataflow/examples/wordcount-example).\n\nIf the Cloud Storage bucket is outside of your [service perimeter](/vpc-service-controls/docs/overview), create an [egress rule](/vpc-service-controls/docs/ingress-egress-rules) that allows access to the bucket.\n\nTemplate parameters\n-------------------\n\nRun the WordCount template\n--------------------------\n\n### Console\n\n1. Go to the Dataflow **Create job from template** page.\n[Go to Create job from template](https://console.cloud.google.com/dataflow/createjob)\n2. In the **Job name** field, enter a unique job name.\n3. Optional: For **Regional endpoint** , select a value from the drop-down menu. The default region is `us-central1`.\n\n\n For a list of regions where you can run a Dataflow job, see\n [Dataflow locations](/dataflow/docs/resources/locations).\n4. From the **Dataflow template** drop-down menu, select the WordCount template.\n5. In the provided parameter fields, enter your parameter values.\n6. Click **Run job**.\n\n### gcloud\n\n\n| **Note:** To use the Google Cloud CLI to run classic templates, you must have [Google Cloud CLI](/sdk/docs/install) version 138.0.0 or later.\n\nIn your shell or terminal, run the template:\n\n\u003cbr /\u003e\n\n gcloud dataflow jobs run JOB_NAME \\\n --gcs-location gs://dataflow-templates/latest/Word_Count \\\n --region REGION_NAME \\\n --parameters \\\n inputFile=gs://dataflow-samples/shakespeare/kinglear.txt,output=gs://BUCKET_NAME/output/my_output\n\n\nReplace the following:\n\n- `JOB_NAME`:\n a unique job name of your choice\n\n- `REGION_NAME`:\n the [region](/dataflow/docs/resources/locations) where you want to\n deploy your Dataflow job---for example, `us-central1`\n\n- `BUCKET_NAME`: the name of your Cloud Storage bucket\n\n\u003cbr /\u003e\n\n### API\n\n\nTo run the template using the REST API, send an HTTP POST request. 
For more information on the\nAPI and its authorization scopes, see\n[`projects.templates.launch`](/dataflow/docs/reference/rest/v1b3/projects.templates/launch).\n\n\u003cbr /\u003e\n\n POST https://dataflow.googleapis.com/v1b3/projects/PROJECT_ID/locations/LOCATION/templates:launch?gcsPath=gs://dataflow-templates/latest/Word_Count\n {\n \"jobName\": \"JOB_NAME\",\n \"parameters\": {\n \"inputFile\" : \"gs://dataflow-samples/shakespeare/kinglear.txt\",\n \"output\": \"gs://BUCKET_NAME/output/my_output\"\n },\n \"environment\": { \"zone\": \"us-central1-f\" }\n }\n\n\nReplace the following:\n\n- `PROJECT_ID`: the Google Cloud project ID where you want to run the Dataflow job\n\n\u003c!-- --\u003e\n\n- `JOB_NAME`:\n a unique job name of your choice\n\n- `LOCATION`:\n the [region](/dataflow/docs/resources/locations) where you want to\n deploy your Dataflow job---for example, `us-central1`\n\n- `BUCKET_NAME`: the name of your Cloud Storage bucket\n\n\u003cbr /\u003e\n\n### Template source code\n\n### Java\n\n /*\n * Copyright (C) 2016 Google LLC\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n * use this file except in compliance with the License. You may obtain a copy of\n * the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the\n * License for the specific language governing permissions and limitations under\n * the License.\n */\n package com.google.cloud.teleport.templates;\n\n import com.google.cloud.teleport.metadata.Template;\n import com.google.cloud.teleport.metadata.TemplateCategory;\n import com.google.cloud.teleport.metadata.TemplateParameter;\n import com.google.cloud.teleport.templates.WordCount.WordCountOptions;\n import org.apache.beam.sdk.Pipeline;\n import org.apache.beam.sdk.io.TextIO;\n import org.apache.beam.sdk.metrics.Counter;\n import org.apache.beam.sdk.metrics.Metrics;\n import org.apache.beam.sdk.options.PipelineOptions;\n import org.apache.beam.sdk.options.PipelineOptionsFactory;\n import org.apache.beam.sdk.options.ValueProvider;\n import org.apache.beam.sdk.transforms.Count;\n import org.apache.beam.sdk.transforms.DoFn;\n import org.apache.beam.sdk.transforms.MapElements;\n import org.apache.beam.sdk.transforms.PTransform;\n import org.apache.beam.sdk.transforms.ParDo;\n import org.apache.beam.sdk.transforms.SimpleFunction;\n import org.apache.beam.sdk.values.KV;\n import org.apache.beam.sdk.values.PCollection;\n\n /**\n * A template that counts words in text files.\n *\n * \u003cp\u003eCheck out \u003ca\n * href=\"https://github.com/GoogleCloudPlatform/DataflowTemplates/blob/main/v1/README_Word_Count.md\"\u003eREADME\u003c/a\u003e\n * for instructions on how to use or modify this template.\n */\n @Template(\n name = \"Word_Count\",\n category = TemplateCategory.GET_STARTED,\n displayName = \"Word Count\",\n description =\n \"Batch pipeline. 
Reads text from Cloud Storage, tokenizes text lines into individual words,\"\n + \" and performs frequency count on each of the words.\",\n optionsClass = WordCountOptions.class,\n contactInformation = \"https://cloud.google.com/support\")\n public class WordCount {\n\n static class ExtractWordsFn extends DoFn\u003cString, String\u003e {\n private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, \"emptyLines\");\n\n @ProcessElement\n public void processElement(ProcessContext c) {\n // Check if the line is empty.\n if (c.element().trim().isEmpty()) {\n emptyLines.inc();\n return;\n }\n\n // Split the line into words.\n String[] words = c.element().split(\"[^a-zA-Z']+\");\n\n // Output each word encountered into the output PCollection.\n for (String word : words) {\n if (!word.isEmpty()) {\n c.output(word);\n }\n }\n }\n }\n\n /** A SimpleFunction that converts a Word and Count into a printable string. */\n public static class FormatAsTextFn extends SimpleFunction\u003cKV\u003cString, Long\u003e, String\u003e {\n @Override\n public String apply(KV\u003cString, Long\u003e input) {\n return input.getKey() + \": \" + input.getValue();\n }\n }\n\n /**\n * A PTransform that converts a PCollection containing lines of text into a PCollection of\n * formatted word counts.\n */\n public static class CountWords\n extends PTransform\u003cPCollection\u003cString\u003e, PCollection\u003cKV\u003cString, Long\u003e\u003e\u003e {\n @Override\n public PCollection\u003cKV\u003cString, Long\u003e\u003e expand(PCollection\u003cString\u003e lines) {\n\n // Convert lines of text into individual words.\n PCollection\u003cString\u003e words = lines.apply(ParDo.of(new ExtractWordsFn()));\n\n // Count the number of times each word occurs.\n PCollection\u003cKV\u003cString, Long\u003e\u003e wordCounts = words.apply(Count.\u003cString\u003eperElement());\n\n return wordCounts;\n }\n }\n\n /**\n * Options supported by {@link com.google.cloud.teleport.templates.WordCount}.\n *\n * 
\u003cp\u003eInherits standard configuration options.\n */\n public interface WordCountOptions extends PipelineOptions {\n\n @TemplateParameter.GcsReadFile(\n order = 1,\n description = \"Input file(s) in Cloud Storage\",\n helpText =\n \"The input file pattern Dataflow reads from. Use the example file \"\n + \"(gs://dataflow-samples/shakespeare/kinglear.txt) or enter the path to your own \"\n + \"using the same format: gs://your-bucket/your-file.txt\")\n ValueProvider\u003cString\u003e getInputFile();\n\n void setInputFile(ValueProvider\u003cString\u003e value);\n\n @TemplateParameter.GcsWriteFolder(\n order = 2,\n description = \"Output Cloud Storage file prefix\",\n helpText = \"Path and filename prefix for writing output files. Ex: gs://your-bucket/counts\")\n ValueProvider\u003cString\u003e getOutput();\n\n void setOutput(ValueProvider\u003cString\u003e value);\n }\n\n public static void main(String[] args) {\n WordCountOptions options =\n PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);\n Pipeline p = Pipeline.create(options);\n p.apply(\"ReadLines\", TextIO.read().from(options.getInputFile()))\n .apply(new CountWords())\n .apply(MapElements.via(new FormatAsTextFn()))\n .apply(\"WriteCounts\", TextIO.write().to(options.getOutput()));\n\n p.run();\n }\n }"]]