![[Data Engineering] Docker와 WSL로 구현하는 Kafka 클러스터: 데이터 엔지니어링 환경 구축 튜토리얼](/static/b90b5c5db923c27771a4d12004133bc5/f6053/dockerkafka.png)
삭제 $ sudo apt-get remove docker docker-engine docker.io containerd runc
$ sudo apt-get update
$ sudo apt-get install \
    ca-certificates \
    curl \
    gnupg \
    lsb-release
$ curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
$ echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \
  $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
$ sudo apt-get update $ sudo apt-get install docker-ce docker-ce-cli containerd.io docker-compose-plugin
$ apt-cache madison docker-ce docker-ce | 5:20.10.15~3-0~ubuntu-focal | https://download.docker.com/linux/ubuntu focal/stable amd64 Packages docker-ce | 5:20.10.14~3-0~ubuntu-focal | https://download.docker.com/linux/ubuntu focal/stable amd64 Packages docker-ce | 5:20.10.13~3-0~ubuntu-focal | https://download.docker.com/linux/ubuntu focal/stable amd64 Packages docker-ce | 5:20.10.12~3-0~ubuntu-focal | https://download.docker.com/linux/ubuntu focal/stable amd64 Packages docker-ce | 5:20.10.11~3-0~ubuntu-focal | https://download.docker.com/linux/ubuntu focal/stable amd64 Packages docker-ce | 5:20.10.10~3-0~ubuntu-focal | https://download.docker.com/linux/ubuntu focal/stable amd64 Packages
$ sudo apt-get install docker-ce=<VERSION_STRING> docker-ce-cli=<VERSION_STRING> containerd.io docker-compose-plugin
$ root@L-wslee:~# docker run hello-world
docker: Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?.
See 'docker run --help'.
$ root@L-wslee:~#
root@L-wslee:~# systemctl start docker
System has not been booted with systemd as init system (PID 1). Can't operate.
Failed to connect to bus: Host is down
$ sudo apt-get install cgroupfs-mount $ sudo cgroupfs-mount $ sudo service docker start
$ root@L-wslee:~# docker run hello-world Unable to find image 'hello-world:latest' locally latest: Pulling from library/hello-world 2db29710123e: Pull complete Digest: sha256:10d7d58d5ebd2a652f4d93fdd86da8f265f5318c6a73cc5b6a9798ff6d2b2e67 Status: Downloaded newer image for hello-world:latest Hello from Docker!
$ sudo curl -SL https://github.com/docker/compose/releases/download/v2.5.0/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose
$ sudo chmod +x /usr/local/bin/docker-compose
$ sudo ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose
# Version 확인
$ docker-compose --version
root@L-wslee:/home/nasa1515/docker# docker-compose --version
Docker Compose version v2.5.0
Kafka는 Broker에서 Topic의 Metadata를 저장하기 위해 Zookeeper를 사용합니다.
Zookeeper는 Standalone으로도 구성할 수 있지만, Hadoop과 마찬가지로 실제 운영 환경에서는 단일 노드 구성을 찾아보기 힘듭니다.
그래서 여러 대의 Zookeeper를 Cluster로 구성해 HA를 확보하는데, 이러한 구성을 Zookeeper Ensemble이라고 합니다.
$ mkdir docker
# ZooKeeper 3.7.0 on Ubuntu 20.04 with OpenJDK 8.
# The build context must provide config/zoo.cfg and config/init.sh.
FROM ubuntu:20.04

# Keep apt from asking interactive questions during the build.
ENV DEBIAN_FRONTEND=noninteractive
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64

# WORKDIR creates the directory if it does not exist, so no separate mkdir is needed.
WORKDIR /root/install

# Install Java, wget and vim.
# "update" and "install" share one layer so a stale cached package index is
# never reused when the package list changes.
RUN apt-get update \
    && apt-get install -y openjdk-8-jdk wget vim \
    && rm -rf /var/lib/apt/lists/*

# Install ZooKeeper.
# Old releases are served from archive.apache.org; downloads.apache.org only
# carries the current releases.
RUN wget https://archive.apache.org/dist/zookeeper/zookeeper-3.7.0/apache-zookeeper-3.7.0-bin.tar.gz \
    && tar -zxf apache-zookeeper-3.7.0-bin.tar.gz \
    && mv apache-zookeeper-3.7.0-bin /usr/local/zookeeper \
    && rm apache-zookeeper-3.7.0-bin.tar.gz

# Copy the configuration file and the start-up script.
COPY config/zoo.cfg /usr/local/zookeeper/conf/zoo.cfg
COPY config/init.sh init.sh

# Strip CRs so files edited on Windows (CRLF line endings) still work.
RUN sed -i 's/\r//g' init.sh /usr/local/zookeeper/conf/zoo.cfg

# Exec form: bash runs as PID 1 and receives the stop signal directly instead
# of going through an intermediate "sh -c".
CMD ["bash", "init.sh"]
$ mkdir config
mkdir -p /data # 주키퍼는 myid 파일로 클러스터를 구분한다. 1~255까지 번호를 지정할 수 있다. echo $MY_ID > /data/myid # 주키퍼 서버를 실행한다. /usr/local/zookeeper/bin/zkServer.sh start # 자동으로 종료되지 않도록 방지한다. tail -f /dev/null
# Time, in ticks, a follower has to connect to the leader.
# initLimit * tickTime = 10 * 2000 ms = 20 seconds.
initLimit=10
tickTime=2000
# Maximum time, in ticks, a follower may fall behind the leader
# (syncLimit * tickTime = 10 seconds). This is a time limit, not a follower count.
syncLimit=5
# Directory where the myid file is stored.
dataDir=/data
# Port that clients connect to.
clientPort=2181
# Servers that make up the ensemble.
# server.X=hostname:peerPort:leaderPort
# peerPort is the port the ensemble servers use to talk to each other.
# leaderPort is the port used for leader election.
# -- Careful: no trailing whitespace after 3888!
server.1=nasa1515-zookeeper-1:2888:3888
server.2=nasa1515-zookeeper-2:2888:3888
server.3=nasa1515-zookeeper-3:2888:3888
# Every 24 hours, purge automatically created snapshots, keeping at most 3.
autopurge.snapRetainCount=3
autopurge.purgeInterval=24
$ docker network create zoo ebf8d2ee0ebbac5acce268f5935e5bd80b29ef2b3f29931054b347f8d7c27e8a root@L-wslee:/home/nasa1515/docker# docker network list NETWORK ID NAME DRIVER SCOPE f10c3aaa8146 bridge bridge local c5b41392b3bd host host local 36f1ffafa37e none null local ebf8d2ee0ebb zoo bridge local
root@L-wslee:/home/nasa1515/docker# ls -alrt * -rwxrwxrwx 1 root root 836 May 9 14:24 Dockerfile config: total 16 -rw-r--r-- 1 root root 315 May 9 14:20 init.sh -rwxrwxrwx 1 root root 913 May 9 14:21 zoo.cfg drwxr-xr-x 3 root root 4096 May 9 14:30 .. drwxr-xr-x 2 root root 4096 May 9 14:30 .
$ docker build --tag nasa1515-zookeeper . ... ... ... Removing intermediate container bb88073c3b9a ---> 6e4b73214693 Step 16/17 : RUN sed -i 's/\r//g' /usr/local/zookeeper/conf/zoo.cfg ---> Running in e19594c509af Removing intermediate container e19594c509af ---> 57ab1963d806 Step 17/17 : CMD bash init.sh ---> Running in 46a08706b3a4 Removing intermediate container 46a08706b3a4 ---> ade37e220c72 Successfully built ade37e220c72 Successfully tagged nasa1515-zookeeper:latest
$ docker image ls REPOSITORY TAG IMAGE ID CREATED SIZE nasa1515-zookeeper latest ade37e220c72 42 seconds ago 733MB ubuntu 20.04 53df61775e88 9 days ago 72.8MB hello-world latest feb5d9fea6a5 7 months ago 13.3kB
version: "3.8"

# Settings shared by every ensemble member; merged into each service below.
x-zookeeper-node: &zookeeper-node
  image: nasa1515-zookeeper
  restart: always

# All containers join the "zoo" network so they can resolve each other by hostname.
networks:
  default:
    name: zoo

# One named volume per server, mounted at /data.
volumes:
  nasa1515-zookeeper-1-volume:
    name: nasa1515-zookeeper-1-volume
  nasa1515-zookeeper-2-volume:
    name: nasa1515-zookeeper-2-volume
  nasa1515-zookeeper-3-volume:
    name: nasa1515-zookeeper-3-volume

services:
  nasa1515-zookeeper-1:
    <<: *zookeeper-node
    container_name: nasa1515-zookeeper-1
    hostname: nasa1515-zookeeper-1
    environment:
      - MY_ID=1
    volumes:
      - nasa1515-zookeeper-1-volume:/data

  nasa1515-zookeeper-2:
    <<: *zookeeper-node
    container_name: nasa1515-zookeeper-2
    hostname: nasa1515-zookeeper-2
    environment:
      - MY_ID=2
    volumes:
      - nasa1515-zookeeper-2-volume:/data

  nasa1515-zookeeper-3:
    <<: *zookeeper-node
    container_name: nasa1515-zookeeper-3
    hostname: nasa1515-zookeeper-3
    environment:
      - MY_ID=3
    volumes:
      - nasa1515-zookeeper-3-volume:/data
$ docker-compose up -d; root@L-wslee:/home/nasa1515/docker# docker-compose up -d; [+] Running 4/4 ⠿ Network zoo Created 0.0s ⠿ Container nasa1515-zookeeper-3 Started 0.6s ⠿ Container nasa1515-zookeeper-1 Started 0.8s ⠿ Container nasa1515-zookeeper-2 Started
$ docker exec nasa1515-zookeeper-1 /usr/local/zookeeper/bin/zkServer.sh status $ docker exec nasa1515-zookeeper-2 /usr/local/zookeeper/bin/zkServer.sh status $ docker exec nasa1515-zookeeper-3 /usr/local/zookeeper/bin/zkServer.sh status
root@L-wslee:/home/nasa1515/docker# docker exec nasa1515-zookeeper-1 /usr/local/zookeeper/bin/zkServer.sh status ZooKeeper JMX enabled by default Using config: /usr/local/zookeeper/bin/../conf/zoo.cfg Client port found: 2181. Client address: localhost. Client SSL: false. Mode: follower root@L-wslee:/home/nasa1515/docker# docker exec nasa1515-zookeeper-2 /usr/local/zookeeper/bin/zkServer.sh status ZooKeeper JMX enabled by default Using config: /usr/local/zookeeper/bin/../conf/zoo.cfg Client port found: 2181. Client address: localhost. Client SSL: false. Mode: follower root@L-wslee:/home/nasa1515/docker# docker exec nasa1515-zookeeper-3 /usr/local/zookeeper/bin/zkServer.sh status ZooKeeper JMX enabled by default Using config: /usr/local/zookeeper/bin/../conf/zoo.cfg Client port found: 2181. Client address: localhost. Client SSL: false. Mode: leader
Zookeeper Cluster의 구성을 완료했다면,
이번에는 Kafka를 Docker로 구성한 뒤 Zookeeper Cluster와 연동하는 작업을 진행합니다.
간단하게 Kafka Broker는 Zookeeper와 동일하게 3개로 구성하고 크기를 자유롭게 변경 가능하도록 하려고 합니다.
$ mkdir kafka-broker
# Kafka 3.1.0 (Scala 2.12) broker on Ubuntu 20.04 with OpenJDK 8.
# The build context must provide config/init.sh and config/server.properties.
FROM ubuntu:20.04

# Keep apt from asking interactive questions during the build.
ENV DEBIAN_FRONTEND=noninteractive
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64

# WORKDIR creates the directory if it does not exist, so no separate mkdir is needed.
WORKDIR /root/install

# "update" and "install" share one layer so a stale cached package index is
# never reused when the package list changes.
RUN apt-get update \
    && apt-get install -y openjdk-8-jdk wget vim \
    && rm -rf /var/lib/apt/lists/*

# Old releases are served from archive.apache.org; downloads.apache.org only
# carries the current releases.
RUN wget https://archive.apache.org/dist/kafka/3.1.0/kafka_2.12-3.1.0.tgz \
    && tar -zxf kafka_2.12-3.1.0.tgz \
    && mv kafka_2.12-3.1.0 /usr/local/kafka \
    && rm kafka_2.12-3.1.0.tgz

# Directory used as log.dirs by server.properties.
RUN mkdir -p /data

COPY config/init.sh init.sh
COPY config/server.properties /usr/local/kafka/config/server.properties

# Strip CRs so files edited on Windows (CRLF line endings) still work.
RUN sed -i 's/\r//g' init.sh /usr/local/kafka/config/server.properties

# Exec form: bash runs as PID 1 and receives the stop signal directly instead
# of going through an intermediate "sh -c".
CMD ["bash", "init.sh"]
$ mkdir config
#!/bin/bash
$ sed -i "s/{{broker_id}}/$BROKER_ID/" /usr/local/kafka/config/server.properties
/usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server.properties
# Kafka broker configuration, copied into the image as
# /usr/local/kafka/config/server.properties.

# Network and I/O thread counts.
num.network.threads=3
num.io.threads=8
# Socket buffer sizes (100 KiB) and maximum request size (100 MiB), in bytes.
socket.send.buffer.bytes=102400
socket.receive.buffer.bytes=102400
socket.request.max.bytes=104857600
# Directory for the broker's log segments.
log.dirs=/data
num.partitions=1
num.recovery.threads.per.data.dir=1
# NOTE(review): the internal topics use replication factor 1 / min ISR 1 even
# though the tutorial starts three brokers - confirm this is intended.
offsets.topic.replication.factor=1
transaction.state.log.replication.factor=1
transaction.state.log.min.isr=1
# Retention: 168 hours (7 days), 1 GiB segments, 5 GiB size limit,
# checked every 300000 ms (5 minutes).
log.retention.hours=168
log.segment.bytes=1073741824
log.retention.bytes=5368709120
log.retention.check.interval.ms=300000
# The three ZooKeeper servers; the trailing /default-kafka is a chroot path
# under which this cluster keeps its znodes.
zookeeper.connect=nasa1515-zookeeper-1:2181,nasa1515-zookeeper-2:2181,nasa1515-zookeeper-3:2181/default-kafka
zookeeper.connection.timeout.ms=18000
group.initial.rebalance.delay.ms=0
auto.create.topics.enable=true
# Placeholder: init.sh replaces {{broker_id}} with the BROKER_ID environment
# variable when the container starts.
broker.id={{broker_id}}
version: "3.8"

# Settings shared by every broker; merged into each service below.
x-kafka-broker: &kafka-broker
  image: nasa1515-kafka
  restart: always

# Brokers join the "zoo" network so they can reach the ZooKeeper hosts by name.
networks:
  default:
    name: zoo

# One named volume per broker, mounted at /data.
volumes:
  nasa1515-kafka-1-volume:
    name: nasa1515-kafka-1-volume
  nasa1515-kafka-2-volume:
    name: nasa1515-kafka-2-volume
  nasa1515-kafka-3-volume:
    name: nasa1515-kafka-3-volume

services:
  nasa1515-kafka-1:
    <<: *kafka-broker
    container_name: nasa1515-kafka-1
    hostname: nasa1515-kafka-1
    environment:
      - BROKER_ID=1
    volumes:
      - nasa1515-kafka-1-volume:/data

  nasa1515-kafka-2:
    <<: *kafka-broker
    container_name: nasa1515-kafka-2
    hostname: nasa1515-kafka-2
    environment:
      - BROKER_ID=2
    volumes:
      - nasa1515-kafka-2-volume:/data

  nasa1515-kafka-3:
    <<: *kafka-broker
    container_name: nasa1515-kafka-3
    hostname: nasa1515-kafka-3
    environment:
      - BROKER_ID=3
    volumes:
      - nasa1515-kafka-3-volume:/data
root@L-wslee:/home/nasa1515/docker/kafka-broker# ls -alrt * -rwxrwxrwx 1 root root 639 May 10 11:11 Dockerfile -rwxrwxrwx 1 root root 919 May 10 11:14 docker-compose.yml config: total 16 -rw-r--r-- 1 root root 178 May 10 11:12 init.sh -rw-r--r-- 1 root root 681 May 10 11:13 server.properties drwxrwxrwx 2 root root 4096 May 10 11:13 . drwxr-xr-x 3 root root 4096 May 10 11:14 ..
$ docker build --tag nasa1515-kafka . ... ... ... Step 17/18 : RUN sed -i 's/\r//g' /usr/local/kafka/config/server.properties ---> Running in c653a7573fc3 Removing intermediate container c653a7573fc3 ---> 598ea0a00be3 Step 18/18 : CMD bash init.sh ---> Running in f8a3710be663 Removing intermediate container f8a3710be663 ---> 5581d8201d8b Successfully built 5581d8201d8b Successfully tagged nasa1515-kafka:latest
root@L-wslee:/home/nasa1515/docker/kafka-broker# docker image ls REPOSITORY TAG IMAGE ID CREATED SIZE nasa1515-kafka latest 5581d8201d8b About a minute ago 920MB nasa1515-zookeeper latest e71e36444916 19 hours ago 737MB ubuntu 20.04 53df61775e88 10 days ago 72.8MB hello-world latest feb5d9fea6a5 7 months ago 13.3kB
$ docker-compose up -d; ... ... root@L-wslee:/home/nasa1515/docker/kafka-broker# docker-compose up -d; [+] Running 7/7 ⠿ Network zoo Created 0.0s ⠿ Volume "nasa1515-kafka-1-volume" Created 0.0s ⠿ Volume "nasa1515-kafka-2-volume" Created 0.0s ⠿ Volume "nasa1515-kafka-3-volume" Created 0.0s ⠿ Container nasa1515-kafka-1 Started 1.1s ⠿ Container nasa1515-kafka-3 Started 1.1s ⠿ Container nasa1515-kafka-2 Started 1.2s
$ docker ps ... ... root@L-wslee:/home/nasa1515/docker/kafka-broker# docker ps CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 5aed89a22bec nasa1515-kafka "/bin/sh -c 'bash in…" About a minute ago Up 19 seconds nasa1515-kafka-1 2bea2dcd9062 nasa1515-kafka "/bin/sh -c 'bash in…" About a minute ago Up 18 seconds nasa1515-kafka-3 8ff56e284749 nasa1515-kafka "/bin/sh -c 'bash in…" About a minute ago Up 18 seconds nasa1515-kafka-2
$ docker logs nasa1515-kafka-3
...
...
...
# 아래 로그의 broker=3 / id=3은 브로커 개수가 아니라 브로커 ID입니다.
# "started" 로그가 보이면 해당 브로커가 정상적으로 기동된 것이며, 나머지 브로커(nasa1515-kafka-1, nasa1515-kafka-2)도 같은 방법으로 확인합니다.
[2022-05-10 02:26:33,654] INFO [KafkaServer id=3] started (kafka.server.KafkaServer)
[2022-05-10 02:26:33,928] INFO [BrokerToControllerChannelManager broker=3 name=alterIsr]: Recorded new controller, from now on will use broker nasa1515-kafka-3:9092 (id: 3 rack: null) (kafka.server.BrokerToControllerRequestThread)
[2022-05-10 02:26:33,930] INFO [BrokerToControllerChannelManager broker=3 name=forwarding]: Recorded new controller, from now on will use broker nasa1515-kafka-3:9092 (id: 3 rack: null) (kafka.server.BrokerToControllerRequestThread)
$ mkdir confluent
# Confluent Community 7.1.1 (Kafka REST Proxy) on Ubuntu 20.04 with OpenJDK 8.
# The build context must provide config/kafka-rest.properties.
FROM ubuntu:20.04

# Keep apt from asking interactive questions during the build.
ENV DEBIAN_FRONTEND=noninteractive
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64

# WORKDIR creates the directory if it does not exist, so no separate mkdir is needed.
WORKDIR /root/install

# "update" and "install" share one layer so a stale cached package index is
# never reused when the package list changes.
RUN apt-get update \
    && apt-get install -y openjdk-8-jdk wget vim \
    && rm -rf /var/lib/apt/lists/*

# Install confluent-community (downloaded over HTTPS).
RUN wget https://packages.confluent.io/archive/7.1/confluent-community-7.1.1.tar.gz \
    && tar -zxf confluent-community-7.1.1.tar.gz \
    && mv confluent-7.1.1 /usr/local/confluent \
    && rm confluent-community-7.1.1.tar.gz

# Copy the kafka-rest configuration file.
COPY config/kafka-rest.properties /usr/local/confluent/etc/kafka-rest/kafka-rest.properties
# Strip CRs so a file edited on Windows (CRLF line endings) still works.
RUN sed -i 's/\r//g' /usr/local/confluent/etc/kafka-rest/kafka-rest.properties

# Start kafka-rest. Exec form, so the start script runs as PID 1 without an
# intermediate "sh -c".
CMD ["/usr/local/confluent/bin/kafka-rest-start", "/usr/local/confluent/etc/kafka-rest/kafka-rest.properties"]
$ mkdir config
# Kafka REST Proxy configuration, copied into the image as
# /usr/local/confluent/etc/kafka-rest/kafka-rest.properties.

# Kafka REST Proxy instance id.
id=default
# schema.registry.url=http://localhost:8081
# NOTE(review): the brokers register under the /default-kafka chroot in their
# zookeeper.connect, but no chroot is given here - confirm whether this
# setting is still used by this REST Proxy version.
zookeeper.connect=nasa1515-zookeeper-1:2181,nasa1515-zookeeper-2:2181,nasa1515-zookeeper-3:2181
# The three Kafka brokers, addressed by their container hostnames.
bootstrap.servers=PLAINTEXT://nasa1515-kafka-1:9092,PLAINTEXT://nasa1515-kafka-2:9092,PLAINTEXT://nasa1515-kafka-3:9092
version: '3.8'

# Join the "zoo" network so the REST Proxy can reach the Kafka and ZooKeeper
# containers by hostname.
networks:
  default:
    name: zoo

services:
  # The image name must match the tag used when building the image
  # (docker build --tag nasa1515-confluent-kafka .).
  nasa1515-confluent-kafka:
    container_name: nasa1515-confluent-kafka
    hostname: nasa1515-confluent-kafka
    image: nasa1515-confluent-kafka
    restart: always
    ports:
      # Publish the REST Proxy port on the host.
      - 8082:8082
root@L-wslee:/home/nasa1515/docker/Confluent# ls -alrt * -rwxrwxrwx 1 root root 798 May 10 13:44 Dockerfile -rwxrwxrwx 1 root root 258 May 10 13:45 docker-compose.yml config: total 12 -rw-r--r-- 1 root root 285 May 10 13:44 kafka-rest.properties drwxrwxrwx 2 root root 4096 May 10 13:44 . drwxr-xr-x 3 root root 4096 May 10 13:44 ..
$ docker build --tag nasa1515-confluent-kafka . ... ... Step 15/15 : CMD /usr/local/confluent/bin/kafka-rest-start /usr/local/confluent/etc/kafka-rest/kafka-rest.properties ---> Running in 65ca3e623728 Removing intermediate container 65ca3e623728 ---> 661f80b47926 Successfully built 661f80b47926 Successfully tagged nasa1515-confluent-kafka:latest
root@L-wslee:/home/nasa1515/docker/Confluent# docker image ls
REPOSITORY                 TAG       IMAGE ID       CREATED          SIZE
nasa1515-confluent-kafka   latest    661f80b47926   22 seconds ago   1.99GB
nasa1515-zookeeper         latest    b2b8ec726801   2 hours ago      737MB
nasa1515-kafka             latest    5581d8201d8b   3 hours ago      920MB
ubuntu                     20.04     53df61775e88   10 days ago      72.8MB
hello-world                latest    feb5d9fea6a5   7 months ago     13.3kB
$ docker-compose up -d ... ... [+] Running 1/1 ⠿ Container nasa1515-confluent-kafka Started 0.6s
$ docker ps ... ... root@L-wslee:/home/nasa1515/docker/Confluent# docker ps CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES d1f069f93aca nasa1515-confluent-kafka "/bin/sh -c '/usr/lo…" 34 seconds ago Up 33 seconds 0.0.0.0:8082->8082/tcp, :::8082->8082/tcp nasa1515-confluent-kafka 685b2f6adc7f nasa1515-kafka "/bin/sh -c 'bash in…" 2 hours ago Up 2 hours nasa1515-kafka-3 86ee421b76a8 nasa1515-kafka "/bin/sh -c 'bash in…" 2 hours ago Up 2 hours nasa1515-kafka-1 10c8c2384fed nasa1515-kafka "/bin/sh -c 'bash in…" 2 hours ago Up 2 hours nasa1515-kafka-2 d3fe193618f7 nasa1515-zookeeper "/bin/sh -c 'bash in…" 2 hours ago Up 2 hours nasa1515-zookeeper-1 021037ca8443 nasa1515-zookeeper "/bin/sh -c 'bash in…" 2 hours ago Up 2 hours nasa1515-zookeeper-3 6f9cd8cdd917 nasa1515-zookeeper "/bin/sh -c 'bash in…" 2 hours ago Up 2 hours nasa1515-zookeeper-2
import json

import requests

# Produce one JSON record to the "nasa1515" topic through the Kafka REST Proxy.
# The Content-Type tells the proxy that the payload uses the v2 JSON embedded format.
headers = {
    'Content-Type': 'application/vnd.kafka.json.v2+json',
}
data = '{"records":[{"value":{"id":"probiotics"}}]}'

# A timeout keeps the script from hanging forever if the proxy is unreachable.
response = requests.post(
    'http://localhost:8082/topics/nasa1515',
    headers=headers,
    data=data,
    timeout=10,
)
print(response)
print(json.dumps(response.json(), indent=4))
<Response [200]>
{
"offsets": [
{
"partition": 0,
"offset": 0,
"error_code": null,
"error": null
}
],
"key_schema_id": null,
"value_schema_id": null
}
import json

import requests

# Fetch the metadata (configs and partitions) of the "nasa1515" topic from the
# Kafka REST Proxy. The URL must not carry a trailing slash or whitespace.
# A timeout keeps the script from hanging forever if the proxy is unreachable.
response = requests.get('http://localhost:8082/topics/nasa1515', timeout=10)
print(response)
print(json.dumps(response.json(), indent=4))
<Response [200]>
{
"name": "nasa1515",
"configs": {
"compression.type": "producer",
"leader.replication.throttled.replicas": "",
"message.downconversion.enable": "true",
"min.insync.replicas": "1",
"segment.jitter.ms": "0",
"cleanup.policy": "delete",
"flush.ms": "9223372036854775807",
"follower.replication.throttled.replicas": "",
"segment.bytes": "1073741824",
"retention.ms": "604800000",
"flush.messages": "9223372036854775807",
"message.format.version": "3.0-IV1",
"file.delete.delay.ms": "60000",
"max.compaction.lag.ms": "9223372036854775807",
"max.message.bytes": "1048588",
"min.compaction.lag.ms": "0",
"message.timestamp.type": "CreateTime",
"preallocate": "false",
"min.cleanable.dirty.ratio": "0.5",
"index.interval.bytes": "4096",
"unclean.leader.election.enable": "false",
"retention.bytes": "5368709120",
"delete.retention.ms": "86400000",
"segment.ms": "604800000",
"message.timestamp.difference.max.ms": "9223372036854775807",
"segment.index.bytes": "10485760"
},
"partitions": [
{
"partition": 0,
"leader": 3,
"replicas": [
{
"broker": 3,
"leader": true,
"in_sync": true
}
]
}
]
}
아무래도 모든 Cluster를 Docker Container로 구동하다 보니 상태를 일일이 확인하기가 너무 힘들다.
그래서 구글링을 하다가 통합 GUI를 지원하는 툴(Kafdrop)을 발견했다. 물론 이 툴도 Docker로 띄운다.
$ mkdir kafdrop
version: "3.8"

# Join the "zoo" network so Kafdrop can reach the brokers by hostname.
networks:
  default:
    name: zoo

services:
  kafdrop:
    image: obsidiandynamics/kafdrop
    restart: always
    environment:
      # Brokers Kafdrop connects to.
      - "KAFKA_BROKERCONNECT=nasa1515-kafka-1:9092,nasa1515-kafka-2:9092,nasa1515-kafka-3:9092"
      # JVM heap limits for Kafdrop.
      - "JVM_OPTS=-Xms32M -Xmx64M"
    ports:
      # Kafdrop web UI.
      - "9000:9000"
$ docker-compose up -d;
