Setting up a big data environment with Docker on Alibaba Cloud: how do I access the DataNodes from my local machine?

I used a docker-compose template to set up a pseudo-distributed cluster on a single Alibaba Cloud ECS instance (8 cores, 64 GB).
When I try to upload a file to HDFS from Python, the upload fails.

The code is:

from hdfs import Client

# hdfs_host / hdfs_port point at the NameNode's WebHDFS endpoint;
# hdfs_path is the target path in HDFS and item is the local file to upload
client = Client(url="http://" + hdfs_host + ":" + hdfs_port)
client.upload(hdfs_path=hdfs_path, local_path=item, cleanup=True, overwrite=True)

The error output is:

datanode2
50075
ERROR:root:datanode2
ERROR:root:50075
ERROR:hdfs.client:Error while uploading. Attempting cleanup.
Traceback (most recent call last):
  File "C:\Users\Yingchao Ha\AppData\Local\Programs\Python\Python39\lib\site-packages\urllib3\connection.py", line 176, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\Yingchao Ha\AppData\Local\Programs\Python\Python39\lib\site-packages\urllib3\util\connection.py", line 96, in create_connection
    raise err
  File "C:\Users\Yingchao Ha\AppData\Local\Programs\Python\Python39\lib\site-packages\urllib3\util\connection.py", line 86, in create_connection
    sock.connect(sa)
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
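
The "datanode2" and "50075" lines in the output suggest the client is being redirected from the NameNode to a DataNode by its internal container hostname. A minimal sketch of how one could confirm this by hand, assuming the standard two-step WebHDFS CREATE flow (the host and path below are placeholders, not my real values):

import requests

# Placeholder values -- replace with the real ECS public IP and a test path.
NAMENODE = "http://<ecs-public-ip>:50070"
HDFS_PATH = "/tmp/webhdfs_probe.txt"

# Step 1: ask the NameNode where to write. It should answer with a 307 redirect
# whose Location header points at a DataNode (e.g. http://datanode2:50075/...).
resp = requests.put(
    NAMENODE + "/webhdfs/v1" + HDFS_PATH + "?op=CREATE&overwrite=true&user.name=root",
    allow_redirects=False,
)
print(resp.status_code, resp.headers.get("Location"))

# Step 2 (the actual data PUT) would go to that Location. If the Location uses an
# internal hostname such as datanode2, a client outside the Docker network times
# out there, which matches the TimeoutError above.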

Problem analysis:
From solutions I found online, it appears that the NameNode is reachable but the DataNodes are not. One fix I'd like to try is adding the following to hdfs-site.xml:

<property>
    <name>dfs.datanode.use.datanode.hostname</name>
    <value>true</value>
</property>

However, since the cluster is built with docker-compose, I'd like to know how to modify hdfs-site.xml (my guess is sketched right after the hadoop.env listing below).
hadoop.env is as follows:

CORE_CONF_fs_defaultFS=hdfs://namenode:9000
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hive_hosts=*
CORE_CONF_hadoop_proxyuser_hive_groups=*
CORE_CONF_hadoop_proxyuser_hbase_hosts=*
CORE_CONF_hadoop_proxyuser_hbase_groups=*

HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false 

YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_mapreduce_map_output_compress=true
YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec
YARN_CONF_yarn_nodemanager_resource_memory___mb=16384
YARN_CONF_yarn_nodemanager_resource_cpu___vcores=8
YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle

MAPRED_CONF_mapreduce_framework_name=yarn
MAPRED_CONF_mapred_child_java_opts=-Xmx4096m
MAPRED_CONF_mapreduce_map_memory_mb=4096
MAPRED_CONF_mapreduce_reduce_memory_mb=8192
MAPRED_CONF_mapreduce_map_java_opts=-Xmx3072m
MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx6144m
MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-2.7.4/
MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-2.7.4/
MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-2.7.4/

HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql:5432/metastore
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hive-metastore:9083
HIVE_SITE_CONF_hbase_zookeeper_quorum=zoo1:2181,zoo2:2181,zoo3:2181
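
My guess, judging from how the HDFS_CONF_* entries above seem to map onto hdfs-site.xml properties (single underscores become dots, triple underscores become dashes), is that the property could be added as one more line in hadoop.env, though I have not verified this yet:

HDFS_CONF_dfs_datanode_use_datanode_hostname=true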

The relevant parts of docker-compose.yml are as follows:

services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop2.7.4-java8
    container_name: namenode
    hostname: namenode
    volumes:
      - ./data/hadoop/namenode:/hadoop/dfs/name
    environment:
      CLUSTER_NAME: "hadoop"
    env_file:
      - ./hadoop.env
      - ./datanode1.env
      - ./datanode2.env
      - ./datanode3.env
    ports:
      - 50070:50070
      - 9000:9000
      - 8020:8020
    networks:
      default:
        ipv4_address: 172.23.0.31

  resourcemanager:
    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop2.7.4-java8
    container_name: resourcemanager
    hostname: resourcemanager
    depends_on:
      - namenode
      - datanode1
      - datanode2
      - datanode3
    env_file:
      - ./hadoop.env
      - ./datanode1.env
      - ./datanode2.env
      - ./datanode3.env
    environment:
      SERVICE_PRECONDITION: "namenode:50070 datanode1:50075 datanode2:50075 datanode3:50075"
    ports:
      - 8088:8088
      - 8030:8030
      - 8031:8031
      - 8032:8032
      - 8033:8033
    networks:
      default:
        ipv4_address: 172.23.0.44

  historyserver:
    image: bde2020/hadoop-historyserver:2.0.0-hadoop2.7.4-java8
    container_name: historyserver
    hostname: historyserver
    depends_on:
      - namenode
      - datanode1
      - datanode2
      - datanode3
    env_file:
      - ./hadoop.env
      - ./datanode1.env
      - ./datanode2.env
      - ./datanode3.env
    environment:
      SERVICE_PRECONDITION: "namenode:50070 datanode1:50075 datanode2:50075 datanode3:50075 resourcemanager:8088"
    ports:
      - 8188:8188
    networks:
      default:
        ipv4_address: 172.23.0.33

  nodemanager1:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop2.7.4-java8
    container_name: nodemanager1
    hostname: nodemanager1
    depends_on:
      - namenode
      - datanode1
      - datanode2
      - datanode3
    env_file:
      - ./hadoop.env
      - ./datanode1.env
      - ./datanode2.env
      - ./datanode3.env
    ports:
      - 8142:8042
      - 8140:8040
      - 41655:46655
    environment:
      SERVICE_PRECONDITION: "namenode:50070 datanode1:50075 datanode2:50075 datanode3:50075 resourcemanager:8088"
    networks:
      default:
        ipv4_address: 172.23.0.41

  nodemanager2:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop2.7.4-java8
    container_name: nodemanager2
    hostname: nodemanager2
    depends_on:
      - namenode
      - datanode1
      - datanode2
      - datanode3
    env_file:
      - ./hadoop.env
      - ./datanode1.env
      - ./datanode2.env
      - ./datanode3.env
    ports:
      - 8242:8042
      - 8240:8040
      - 42655:46655
    environment:
      SERVICE_PRECONDITION: "namenode:50070 datanode1:50075 datanode2:50075 datanode3:50075 resourcemanager:8088"
    networks:
      default:
        ipv4_address: 172.23.0.42

  nodemanager3:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop2.7.4-java8
    container_name: nodemanager3
    hostname: nodemanager3
    depends_on:
      - namenode
      - datanode1
      - datanode2
      - datanode3
    env_file:
      - ./hadoop.env
      - ./datanode1.env
      - ./datanode2.env
      - ./datanode3.env
    ports:
      - 8342:8042
      - 8340:8040
      - 43655:46655
    environment:
      SERVICE_PRECONDITION: "namenode:50070 datanode1:50075 datanode2:50075 datanode3:50075 resourcemanager:8088"
    networks:
      default:
        ipv4_address: 172.23.0.43

  datanode1:
    image: bde2020/hadoop-datanode:2.0.0-hadoop2.7.4-java8
    container_name: datanode1
    hostname: datanode1
    depends_on:
      - namenode
    env_file:
      - ./hadoop.env
      - ./datanode1.env
    volumes:
      - ./data/hadoop/datanode1:/hadoop/dfs/data
    ports:
      - 50175:50075
      - 50110:50110
    environment:
      SERVICE_PRECONDITION: "namenode:50070"
    networks:
      default:
        ipv4_address: 172.23.0.35

  datanode2:
    image: bde2020/hadoop-datanode:2.0.0-hadoop2.7.4-java8
    container_name: datanode2
    hostname: datanode2
    depends_on:
      - namenode
    env_file:
      - ./hadoop.env
      - ./datanode2.env
    volumes:
      - ./data/hadoop/datanode2:/hadoop/dfs/data
    ports:
      - 50275:50075
      - 50210:50210
    environment:
      SERVICE_PRECONDITION: "namenode:50070"
    networks:
      default:
        ipv4_address: 172.23.0.36

  datanode3:
    image: bde2020/hadoop-datanode:2.0.0-hadoop2.7.4-java8
    container_name: datanode3
    hostname: datanode3
    depends_on:
      - namenode
    env_file:
      - ./hadoop.env
      - ./datanode3.env
    volumes:
      - ./data/hadoop/datanode3:/hadoop/dfs/data
    ports:
      - 50375:50075
      - 50310:50310
    environment:
      SERVICE_PRECONDITION: "namenode:50070"
    networks:
      default:
        ipv4_address: 172.23.0.37

My local hosts file has already been updated and all of the web UIs are reachable. A second question: how should I adjust the YARN parameters to match the current 8-core / 64 GB machine?
I changed a few of them before, but after restarting it did not seem to take effect, and I also ran into Spark being unable to obtain resources from YARN, which is why I upgraded the instance in the first place.
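
For the second question, the YARN sizing also appears to be driven by hadoop.env rather than by editing yarn-site.xml directly. My guess at the relevant entries is below; the values are only illustrative placeholders for an 8-core / 64 GB host shared by three NodeManagers, not validated settings:

# Per-NodeManager resources (illustrative placeholders; three NodeManagers share one ECS host)
YARN_CONF_yarn_nodemanager_resource_memory___mb=16384
YARN_CONF_yarn_nodemanager_resource_cpu___vcores=8
# Bounds on a single container request (illustrative placeholders)
YARN_CONF_yarn_scheduler_minimum___allocation___mb=1024
YARN_CONF_yarn_scheduler_maximum___allocation___mb=16384
YARN_CONF_yarn_scheduler_maximum___allocation___vcores=8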
