原创

【Flume】自定义Kafka sink,并编译打包成jar,解决unapproved license的问题

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://xxlcube.blog.csdn.net/article/details/42489885


以下是我的自定义kafka sink插件的pom文件,编译成jar包丢到flume的lib下即可使用

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Stand-alone Maven build for the custom Flume Kafka sink.
  This POM declares no <parent> and no license-check (RAT) plugin; it only
  builds a plain jar from the sources and lists the libraries they compile against.
-->


<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  
  <!-- Coordinates of the produced artifact: flume-sinks:cmcc-kafka-sink:1.0.0 -->
  <groupId>flume-sinks</groupId>
  <artifactId>cmcc-kafka-sink</artifactId>
  <name>Flume Kafka Sink</name>
  <version>1.0.0</version>
  <build>
    <plugins>
      <!-- Packages the compiled classes into the jar.
           NOTE(review): no <version> is pinned here, so the plugin version is
           whatever the running Maven resolves by default; consider pinning it. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
      </plugin>
    </plugins>
  </build>


  <dependencies>
    <!-- Flume 1.5.2 modules the sink is compiled against.
         NOTE(review): these are declared with the default (compile) scope; since the
         jar is meant to be dropped into Flume's own lib directory, "provided" may be
         the intended scope. Confirm before changing. -->
    <dependency>
      <groupId>org.apache.flume</groupId>
      <artifactId>flume-ng-sdk</artifactId>
      <version>1.5.2</version>
    </dependency>


    <dependency>
      <groupId>org.apache.flume</groupId>
      <artifactId>flume-ng-core</artifactId>
      <version>1.5.2</version>
    </dependency>


    <dependency>
      <groupId>org.apache.flume</groupId>
      <artifactId>flume-ng-configuration</artifactId>
      <version>1.5.2</version>
    </dependency>


    <!-- Logging facade used by the sink (org.slf4j.Logger / LoggerFactory). -->
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.6.1</version>
    </dependency>


    <!-- Test-only dependency; not needed at runtime. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.10</version>
      <scope>test</scope>
    </dependency>


    <!-- Kafka 0.8.1.1 client classes (kafka.javaapi.producer.Producer, KeyedMessage,
         ProducerConfig). The "_2.10" suffix presumably denotes the Scala build of the
         artifact; the jar deployed next to Flume should match it. -->
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka_2.10</artifactId>
      <version>0.8.1.1</version>
    </dependency>
  </dependencies>


</project>
这里去掉了parent,也去掉了rat plugin,这样就避免了编译时出现的常见错误(unapproved license),详见 https://issues.apache.org/jira/browse/FLUME-1372

在Constants类中定义了几个常量

/** Sink property name for the maximum number of events taken per transaction. */
public static final String BATCH_SIZE = "batchSize";
	/** Batch size used when {@link #BATCH_SIZE} is not configured. */
	public static final int DEFAULT_BATCH_SIZE = 100;
    /**
     * Sink property name whose value, when non-empty, is used as the key of every
     * message sent to Kafka; when empty, messages are sent without a key.
     */
    public static final String PARTITION_KEY_NAME = "cmcc.partition.key";
    /** Sink property name for the charset used to decode the event body into a String. */
    public static final String ENCODING_KEY_NAME = "cmcc.encoding";
    /** Charset used when {@link #ENCODING_KEY_NAME} is not configured. */
    public static final String DEFAULT_ENCODING = "UTF-8";
    /**
     * Sink property name for the target Kafka topic.
     * NOTE(review): "CUSTOME" is a misspelling of "CUSTOM"; the name is referenced
     * by the sink, so it is left unchanged here.
     */
    public static final String CUSTOME_TOPIC_KEY_NAME = "cmcc.topic.name";
    /** Topic used when {@link #CUSTOME_TOPIC_KEY_NAME} is not configured. */
    public static final String DEFAULT_TOPIC_NAME="CMCC";



自定义sink实现需要继承AbstractSink和实现接口Configurable,并重写部分方法,如下:

package org.apache.flume.cmcc.kafka;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;

import org.apache.commons.lang.StringUtils;
import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.sink.AbstractSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;

/**
 * Flume sink that drains events from its channel in batches and publishes them
 * to Kafka through the Kafka 0.8 {@code kafka.javaapi.producer.Producer} API.
 *
 * <p>Every parameter of the sink's Flume context is copied into a
 * {@link Properties} object and handed unchanged to {@link ProducerConfig}, so
 * Kafka producer settings can be written directly in the sink configuration.
 * The sink itself reads only the keys declared in {@code Constants}: batch size,
 * topic name, body encoding and an optional message key.
 *
 * <p>Each call to {@link #process()} runs one channel transaction: up to
 * {@code batchSize} events are taken, sent to Kafka in a single
 * {@code producer.send(List)} call, and the transaction is then committed. Any
 * failure rolls the transaction back so the events remain in the channel.
 */
public class CmccKafkaSink extends AbstractSink implements Configurable {

	private static final Logger log = LoggerFactory
			.getLogger(CmccKafkaSink.class);

	/** All sink parameters from the Flume context; passed as-is to ProducerConfig. */
	private Properties parameters;
	/** Kafka producer; created in {@link #start()}, closed in {@link #stop()}. */
	private Producer<String, String> producer;
	/** Maximum number of events taken from the channel per transaction. */
	private int batchSize;
	/** Reusable buffer holding the messages of the batch currently being built. */
	private List<KeyedMessage<String, String>> messageList;
	/** Flume monitoring counters for this sink. */
	private SinkCounter sinkCounter;

	/**
	 * Takes up to {@code batchSize} events from the channel inside one
	 * transaction and sends them to Kafka as a single batch.
	 *
	 * @return {@code Status.BACKOFF} when the channel had no events or when
	 *         publishing failed (the transaction is rolled back in that case);
	 *         {@code Status.READY} when at least one event was delivered
	 */
	@Override
	public Status process() {
		Status result = Status.READY;
		Channel channel = getChannel();
		Transaction transaction = null;
		try {
			// These settings do not change between events, so resolve them once
			// per batch instead of once per event.
			String partitionKey = parameters
					.getProperty(Constants.PARTITION_KEY_NAME);
			String topic = StringUtils.defaultIfEmpty(parameters
					.getProperty(Constants.CUSTOME_TOPIC_KEY_NAME),
					Constants.DEFAULT_TOPIC_NAME);
			String encoding = StringUtils.defaultIfEmpty(
					parameters.getProperty(Constants.ENCODING_KEY_NAME),
					Constants.DEFAULT_ENCODING);
			boolean keyed = !StringUtils.isEmpty(partitionKey);

			long processedEvent = 0;
			transaction = channel.getTransaction();
			transaction.begin();
			messageList.clear();
			for (; processedEvent < batchSize; processedEvent++) {
				Event event = channel.take();
				if (event == null) {
					// Channel is drained; send whatever has been collected so far.
					break;
				}
				String eventData = new String(event.getBody(), encoding);
				KeyedMessage<String, String> data;
				if (keyed) {
					data = new KeyedMessage<String, String>(topic,
							partitionKey, eventData);
				} else {
					data = new KeyedMessage<String, String>(topic, eventData);
				}
				messageList.add(data);
				log.debug("Add data [{}] into messageList,position:{}",
						eventData, processedEvent);
			}

			if (processedEvent == 0) {
				// Nothing to deliver: ask the sink runner to back off instead of
				// calling process() again immediately.
				sinkCounter.incrementBatchEmptyCount();
				result = Status.BACKOFF;
			} else {
				if (processedEvent < batchSize) {
					sinkCounter.incrementBatchUnderflowCount();
				} else {
					sinkCounter.incrementBatchCompleteCount();
				}
				// Attempts are counted once per batch here; counting them per
				// event as well would double the metric.
				sinkCounter.addToEventDrainAttemptCount(processedEvent);
				producer.send(messageList);
				log.debug("Send MessageList to Kafka: [ message List size = {}"
						+ ",processedEvent number = {}] ",
						messageList.size(), processedEvent);
			}
			// One commit per batch rather than per event.
			transaction.commit();
			sinkCounter.addToEventDrainSuccessCount(processedEvent);
		} catch (Exception e) {
			String errorMsg = "Failed to publish events !";
			log.error(errorMsg, e);
			result = Status.BACKOFF;
			if (transaction != null) {
				try {
					transaction.rollback();
					log.debug("transaction rollback success !");
				} catch (Exception ex) {
					log.error(errorMsg, ex);
					throw Throwables.propagate(ex);
				}
			}
		} finally {
			if (transaction != null) {
				transaction.close();
			}
		}
		return result;
	}

	/**
	 * Creates the Kafka producer from the configured parameters, starts the
	 * monitoring counter and then marks the sink as started.
	 *
	 * <p>The producer is built before {@code super.start()} so that a failure
	 * while creating it does not leave the sink flagged as started without a
	 * usable producer.
	 */
	@Override
	public synchronized void start() {
		log.info("Starting {}...", this);
		ProducerConfig config = new ProducerConfig(this.parameters);
		this.producer = new Producer<String, String>(config);
		sinkCounter.start();
		sinkCounter.incrementConnectionCreatedCount();
		super.start();
	}

	/**
	 * Closes the Kafka producer (if one was created), stops the monitoring
	 * counter and delegates to {@code super.stop()} so the lifecycle state of
	 * the sink is updated as well.
	 */
	@Override
	public synchronized void stop() {
		log.debug("Cmcc Kafka sink {} stopping...", getName());
		if (producer != null) {
			// Guarded so that stop() is safe even if start() never succeeded.
			producer.close();
			producer = null;
			sinkCounter.incrementConnectionClosedCount();
		}
		sinkCounter.stop();
		super.stop();
	}

	/**
	 * Reads the sink settings from the Flume context.
	 *
	 * <p>All context parameters are copied into {@code parameters}; they are
	 * later passed to {@link ProducerConfig} in {@link #start()}.
	 *
	 * @param context Flume configuration context of this sink
	 * @throws IllegalArgumentException if the configured batch size is not positive
	 */
	@Override
	public void configure(Context context) {
		batchSize = context.getInteger(Constants.BATCH_SIZE,
				Constants.DEFAULT_BATCH_SIZE);
		if (batchSize <= 0) {
			// A non-positive batch size would make process() never take an event.
			throw new IllegalArgumentException(Constants.BATCH_SIZE
					+ " must be greater than 0, but was " + batchSize);
		}
		messageList = new ArrayList<KeyedMessage<String, String>>(batchSize);

		ImmutableMap<String, String> props = context.getParameters();
		parameters = new Properties();
		parameters.putAll(props);

		if (sinkCounter == null) {
			sinkCounter = new SinkCounter(getName());
		}
	}

}
以上sink同时支持了flume的内部监控


这里为了提高性能,添加了batchSize的概念,也就减少了事务提交的次数

当然,如果通道中已经没有event了(取到的event为null),就不再等待凑满batchSize,而是把本次事务中已经取出的event一并发送并提交

下面看配置

# Configuration of the custom Kafka sink "k1" on agent "a1".
# Keys after "a1.sinks.k1." are handed to the sink; CmccKafkaSink copies all of them
# into the Properties object used to build the Kafka ProducerConfig.
a1.sinks.k1.type=org.apache.flume.cmcc.kafka.CmccKafkaSink
a1.sinks.k1.metadata.broker.list=192.168.11.174:9092
# The sink looks the message key up under "cmcc.partition.key" (Constants.PARTITION_KEY_NAME);
# a bare "partition.key" entry is never read by CmccKafkaSink.
a1.sinks.k1.cmcc.partition.key=0
a1.sinks.k1.partitioner.class=org.apache.flume.cmcc.kafka.CmccPartition
a1.sinks.k1.serializer.class=kafka.serializer.StringEncoder
a1.sinks.k1.request.required.acks=0
a1.sinks.k1.max.message.size=1000000
a1.sinks.k1.cmcc.encoding=UTF-8
a1.sinks.k1.cmcc.topic.name=CMCC
a1.sinks.k1.producer.type=sync
a1.sinks.k1.batchSize=100
这里我们看到,有些属性,我们在Constants中并没有定义,这是如何读取的呢,我们来看下kafka的源码就知道了:

 // Constructor of Kafka's ProducerConfig as quoted by the article. It appears to be
 // decompiler output rather than original source (note the field assignment placed
 // before super() and the $init$ / MODULE$ references), so it is not compilable as shown.
 // It illustrates that the producer settings are looked up by name in the supplied props.
 private ProducerConfig(VerifiableProperties props)
    {
        this.props = props;
        super();
        kafka.producer.async.AsyncProducerConfig.class.$init$(this);
        SyncProducerConfigShared.class.$init$(this);
        // Looked up without a default value in this call.
        brokerList = props.getString("metadata.broker.list");
        // The remaining lookups pass a default as the second argument.
        partitionerClass = props.getString("partitioner.class", "kafka.producer.DefaultPartitioner");
        producerType = props.getString("producer.type", "sync");
        String prop;
        compressionCodec = liftedTree1$1(prop = props.getString("compression.codec", NoCompressionCodec$.MODULE$.name()));
        Object _tmp = null;
        compressedTopics = Utils$.MODULE$.parseCsvList(props.getString("compressed.topics", null));
        messageSendMaxRetries = props.getInt("message.send.max.retries", 3);
        retryBackoffMs = props.getInt("retry.backoff.ms", 100);
        topicMetadataRefreshIntervalMs = props.getInt("topic.metadata.refresh.interval.ms", 600000);
        ProducerConfig$.MODULE$.validate(this);
    }
也就是说,sink在configure()中把配置文件里该sink的全部参数都复制到了Properties中,kafka在实例化ProducerConfig的时候,会按属性名从这个Properties中读取kafka相关的配置信息。




文章最后发布于: 2015-01-07 11:49:33
展开阅读全文
0 个人打赏
私信求帮助

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 酷酷鲨 设计师: CSDN官方博客

分享到微信朋友圈

×

扫一扫,手机浏览