Big Data Platform Configuration File Summary
1. Hadoop Configuration Files
1.1 core-site.xml
<?xml version="1.0" encoding="utf-8"?>
<configuration>
  <!-- HA: the HDFS nameservice is bdpha -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://bdpha</value>
  </property>
  <!-- Temporary directory used for NameNode temporary files -->
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/bdp/tmp</value>
  </property>
  <!-- ZooKeeper ensemble used for automatic failover -->
  <property>
    <name>ha.zookeeper.quorum</name>
    <value>aaa-node1:2181,aaa-node7:2181,aaa-node5:2181</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hadoop.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hadoop.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hue.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hue.groups</name>
    <value>*</value>
  </property>
  <!-- Trash retention time in minutes (1440 = 1 day) -->
  <property>
    <name>fs.trash.interval</name>
    <value>1440</value>
  </property>
  <!-- Enable LZO compression -->
  <property>
    <name>io.compression.codecs</name>
    <value>org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,com.hadoop.compression.lzo.LzoCodec,com.hadoop.compression.lzo.LzopCodec</value>
  </property>
</configuration>
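A quick sanity check that this file is being picked up (a minimal sketch; the test path /tmp/trash-check is an assumption):
# The resolved default filesystem should report hdfs://bdpha
hdfs getconf -confKey fs.defaultFS
# With fs.trash.interval=1440, a deleted file moves to .Trash instead of being removed immediately
hadoop fs -touchz /tmp/trash-check && hadoop fs -rm /tmp/trash-check
hadoop fs -ls /user/$USER/.Trash/Current/tmp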
1.2 hadoop-env.sh
export JAVA_HOME=/opt/java/jre/
export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}
export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"
export HADOOP_NAMENODE_OPTS="-XX:MaxPermSize=2048m -Xmx16000m -Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
export HADOOP_DATANODE_OPTS="-XX:MaxPermSize=1024m -Xmx2048m -Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"
export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"
export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS"
export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS"
export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}
export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
export HADOOP_PID_DIR=/opt/hadoop/PID
export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}
export HADOOP_IDENT_STRING=$USER
export LD_LIBRARY_PATH=/usr/local/hadoop/lzo/lib:$LD_LIBRARY_PATH
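A quick check that the LZO native library referenced by LD_LIBRARY_PATH is actually present (a minimal sketch; the filename libgplcompression is an assumption based on the usual hadoop-lzo build):
# Look for the hadoop-lzo native library
ls /usr/local/hadoop/lzo/lib/libgplcompression.* 2>/dev/null || echo "hadoop-lzo native library not found"
# List the built-in native libraries Hadoop can load
hadoop checknative -a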
1.3 hdfs-site.xml
<?xml version="1.0" encoding="utf-8"?>
<configuration>
  <!-- Name of the HDFS nameservice -->
  <property>
    <name>dfs.nameservices</name>
    <value>bdpha</value>
  </property>
  <!-- NameNodes in the cluster -->
  <property>
    <name>dfs.ha.namenodes.bdpha</name>
    <value>nn1,nn2</value>
  </property>
  <!-- RPC address of nn1 -->
  <property>
    <name>dfs.namenode.rpc-address.bdpha.nn1</name>
    <value>aaa-master2:9000</value>
  </property>
  <!-- RPC address of nn2 -->
  <property>
    <name>dfs.namenode.rpc-address.bdpha.nn2</name>
    <value>aaa-master1:9000</value>
  </property>
  <!-- HTTP address of nn1 -->
  <property>
    <name>dfs.namenode.http-address.bdpha.nn1</name>
    <value>aaa-master2:50070</value>
  </property>
  <!-- HTTP address of nn2 -->
  <property>
    <name>dfs.namenode.http-address.bdpha.nn2</name>
    <value>aaa-master1:50070</value>
  </property>
  <!-- Directory where the JournalNode stores edits; must be a single directory, not a list -->
  <property>
    <name>dfs.journalnode.edits.dir</name>
    <value>/data/bdp/journalnode</value>
  </property>
  <!-- Shared edits URI on the JournalNodes; the trailing name (bdpha) does not have to match the nameservice,
       the edits are stored under ${dfs.journalnode.edits.dir}/<that name> -->
  <property>
    <name>dfs.namenode.shared.edits.dir</name>
    <value>qjournal://aaa-node2:8485;aaa-master2:8485;aaa-master1:8485/bdpha</value>
  </property>
  <!-- Fencing method: ensures only one NameNode serves clients at a time -->
  <property>
    <name>dfs.ha.fencing.methods</name>
    <value>shell(/bin/true)</value>
  </property>
  <!-- Passwordless SSH key used by the fencing mechanism -->
  <property>
    <name>dfs.ha.fencing.ssh.private-key-files</name>
    <value>/home/hadoop/.ssh/id_rsa</value>
  </property>
  <!-- SSH connect timeout for fencing -->
  <property>
    <name>dfs.ha.fencing.ssh.connect-timeout</name>
    <value>30000</value>
  </property>
  <!-- If true, permission checking is enabled; otherwise any user can access any file -->
  <property>
    <name>dfs.permissions</name>
    <value>false</value>
  </property>
  <!-- Enable automatic NameNode failover -->
  <property>
    <name>dfs.ha.automatic-failover.enabled</name>
    <value>true</value>
  </property>
  <!-- Failover proxy provider used by clients to locate the active NameNode -->
  <property>
    <name>dfs.client.failover.proxy.provider.bdpha</name>
    <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
  </property>
  <!-- HDFS replication factor -->
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <!-- Directory for HDFS filesystem metadata -->
  <property>
    <name>dfs.name.dir</name>
    <value>/data/bdp/metadata</value>
  </property>
  <!-- Directories for HDFS block data -->
  <property>
    <name>dfs.data.dir</name>
    <value>/data1,/data2,/data3,/data4,/data5,/data6,/data7,/data8,/data9,/data10</value>
  </property>
  <!-- Number of failed volumes in dfs.data.dir the DataNode tolerates before it stops serving -->
  <property>
    <name>dfs.datanode.failed.volumes.tolerated</name>
    <value>3</value>
  </property>
</configuration>
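The HA setup above can be verified with the standard admin commands (a minimal sketch; which NameNode is active depends on cluster state):
# One of nn1/nn2 should report active, the other standby
hdfs haadmin -getServiceState nn1
hdfs haadmin -getServiceState nn2
# The DataNode report should show the capacity of the ten data directories
hdfs dfsadmin -report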
1.4 mapred-site.xml
<?xml version="1.0" encoding="utf-8"?>
<configuration>
  <!-- Fraction of map tasks that must complete before resources may be requested for reduce tasks; default 0.05 -->
  <property>
    <name>mapreduce.job.reduce.slowstart.completedmaps</name>
    <value>0.8</value>
  </property>
  <!-- Run MapReduce on YARN -->
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.map.env</name>
    <value>LD_LIBRARY_PATH=/usr/local/hadoop/lzo/lib:$LD_LIBRARY_PATH</value>
  </property>
  <property>
    <name>mapreduce.reduce.env</name>
    <value>LD_LIBRARY_PATH=/usr/local/hadoop/lzo/lib:$LD_LIBRARY_PATH</value>
  </property>
</configuration>
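A simple way to confirm that MapReduce really runs on YARN with these settings (a minimal sketch; the examples jar path varies by distribution and is an assumption):
# Submit the bundled pi example to the cluster
yarn jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar pi 2 10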
1.5 yarn-env.sh
export HADOOP_YARN_USER=${HADOOP_YARN_USER:-yarn}
export YARN_CONF_DIR="${YARN_CONF_DIR:-$HADOOP_YARN_HOME/conf}"
export YARN_PID_DIR=./pids
export YARN_RESOURCEMANAGER_OPTS="-XX:PermSize=256m -XX:MaxPermSize=512m -Xms4096m -Xmx8192m"
export YARN_NODEMANAGER_OPTS="-XX:PermSize=256m -XX:MaxPermSize=512m -Xms4096m -Xmx4096m"
export LD_LIBRARY_PATH=/usr/local/hadoop/lzo/lib:$LD_LIBRARY_PATH
1.6 yarn-site.xml
<?xml version="1.0" encoding="utf-8"?>
<configuration>
  <!-- Enable ResourceManager HA -->
  <property>
    <name>yarn.resourcemanager.ha.enabled</name>
    <value>true</value>
  </property>
  <!-- Identifies the cluster; when set, make sure every RM has its own id in the configuration -->
  <property>
    <name>yarn.resourcemanager.cluster-id</name>
    <value>yrc</value>
  </property>
  <!-- Logical IDs of the RMs; arbitrary names ("rm1,rm2" here), referenced by the settings below -->
  <property>
    <name>yarn.resourcemanager.ha.rm-ids</name>
    <value>rm1,rm2</value>
  </property>
  <!-- Hostname for rm1 -->
  <property>
    <name>yarn.resourcemanager.hostname.rm1</name>
    <value>aaa-master2</value>
  </property>
  <!-- Hostname for rm2 -->
  <property>
    <name>yarn.resourcemanager.hostname.rm2</name>
    <value>aaa-master1</value>
  </property>
  <!-- ZooKeeper ensemble used by the ResourceManagers -->
  <property>
    <name>yarn.resourcemanager.zk-address</name>
    <value>aaa-node1:2181,aaa-node7:2181,aaa-node5:2181</value>
  </property>
  <!-- Enable RM restart/recovery; default false -->
  <property>
    <name>yarn.resourcemanager.recovery.enabled</name>
    <value>true</value>
  </property>
  <!-- Local directories for container data and intermediate files; default ${hadoop.tmp.dir}/nm-local-dir -->
  <property>
    <name>yarn.nodemanager.local-dirs</name>
    <value>/data1/local/tmp,/data2/local/tmp,/data3/local/tmp,/data4/local/tmp,/data5/local/tmp,/data6/local/tmp,/data7/local/tmp,/data8/local/tmp</value>
  </property>
  <!-- Directories for container runtime logs -->
  <property>
    <name>yarn.nodemanager.log-dirs</name>
    <value>/data1/log/tmp,/data2/log/tmp,/data3/log/tmp,/data4/log/tmp,/data5/log/tmp,/data6/log/tmp,/data7/log/tmp,/data8/log/tmp</value>
  </property>
  <!-- RM state store class; default is org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
       (Hadoop filesystem based); ZKRMStateStore is the ZooKeeper-based implementation -->
  <property>
    <name>yarn.resourcemanager.store.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
  </property>
  <!-- Auxiliary service on the NodeManager; must be mapreduce_shuffle to run MapReduce jobs -->
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <!-- Enable log aggregation -->
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>
  <!-- Memory (MB) YARN may use on this node; default 8 GB. YARN does not detect physical memory,
       so lower this on nodes with less RAM -->
  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>24576</value>
  </property>
  <!-- Minimum physical memory per container request; default 1024 MB -->
  <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>2048</value>
  </property>
  <!-- Maximum physical memory per container request; default 8192 MB -->
  <property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>24576</value>
  </property>
  <!-- Whether to kill containers that exceed their allocated physical memory; default true -->
  <property>
    <name>yarn.nodemanager.pmem-check-enabled</name>
    <value>false</value>
  </property>
  <!-- Whether to kill containers that exceed their allocated virtual memory; default true -->
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
  </property>
  <!-- Virtual CPU cores YARN may use on this node; default 8. Recommended to match the physical
       core count, since YARN does not detect it automatically -->
  <property>
    <name>yarn.nodemanager.resource.cpu-vcores</name>
    <value>32</value>
  </property>
  <!-- Minimum vcores per container request; default 1 -->
  <property>
    <name>yarn.scheduler.minimum-allocation-vcores</name>
    <value>1</value>
  </property>
  <!-- Maximum vcores per container request; default 4. Requests above this throw InvalidResourceRequestException -->
  <property>
    <name>yarn.scheduler.maximum-allocation-vcores</name>
    <value>32</value>
  </property>
</configuration>
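The RM HA state and the per-node resources above can be verified with the standard YARN commands (a minimal sketch):
# One of rm1/rm2 should report active, the other standby
yarn rmadmin -getServiceState rm1
yarn rmadmin -getServiceState rm2
# Each NodeManager should register 24576 MB of memory and 32 vcores
yarn node -list -all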
1.7 log4j.properties
hadoop.root.logger=WARN,console
2. Hive Configuration
2.1 hive-site.xml
<?xml version="1.0" encoding="utf-8"?>
<configuration>
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://aaa-master1:9083</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>hadoop</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
  </property>
  <!-- Note: "&" must be escaped as "&amp;" inside the XML value -->
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://AAA-MASTER1:3306/hive?createDatabaseIfNotExist=true&amp;useUnicode=true&amp;characterEncoding=latin1</value>
  </property>
  <property>
    <name>hive.server2.long.polling.timeout</name>
    <value>50000</value>
  </property>
  <property>
    <name>hive.server2.thrift.port</name>
    <value>10000</value>
  </property>
  <property>
    <name>dfs.permissions</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.security.authorization.createtable.owner.grants</name>
    <value>ALL</value>
  </property>
  <property>
    <name>hive.security.authorization.task.factory</name>
    <value>org.apache.hadoop.hive.ql.parse.authorization.HiveAuthorizationTaskFactoryImpl</value>
  </property>
  <property>
    <name>hive.security.authorization.enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>hadoop0928</value>
  </property>
  <property>
    <name>hive.server2.enable.doAs</name>
    <value>false</value>
  </property>
  <!--<property><name>hive.metastore.execute.setugi</name><value>true</value></property>-->
  <property>
    <name>datanucleus.autoCreateSchema</name>
    <value>true</value>
  </property>
  <property>
    <name>datanucleus.autoCreateColumns</name>
    <value>true</value>
  </property>
  <property>
    <name>datanucleus.autoCreateTables</name>
    <value>true</value>
  </property>
</configuration>
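With the metastore URI and Thrift port above, a connection test might look like this (a minimal sketch; it assumes HiveServer2 runs on aaa-master1 alongside the metastore):
# Start the metastore and HiveServer2 if they are not already running
hive --service metastore &
hive --service hiveserver2 &
# Connect through Beeline as the hadoop user (hive.server2.enable.doAs=false, so queries run as the server user)
beeline -u jdbc:hive2://aaa-master1:10000 -n hadoop -e "show databases;"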
2.2 hive-env.sh
export HADOOP_HOME=/opt/hadoop
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export HIVE_CONF_DIR=/opt/hive/conf
export HIVE_AUX_JARS_PATH=/opt/hive/lib
export SPARK_HOME=/opt/spark
2.3 parquet-logging.properties
.level=WARN
3. HBase Configuration
3.1 hbase-site.xml
<?xml version="1.0" encoding="utf-8"?>
<configuration>
  <property>
    <name>hbase.zookeeper.quorum</name>
    <value>aaa-node1:2181,aaa-node7:2181,aaa-node5:2181</value>
  </property>
  <property>
    <name>hbase.coprocessor.region.classes</name>
    <value>org.apache.hadoop.hbase.security.token.TokenProvider,org.apache.hadoop.hbase.security.access.AccessController</value>
  </property>
  <property>
    <name>hbase.replication</name>
    <value>true</value>
  </property>
  <property>
    <name>hbase.hstore.compaction.max</name>
    <value>20</value>
  </property>
  <property>
    <name>replication.source.nb.capacity</name>
    <value>1000</value>
  </property>
  <property>
    <name>replication.source.ratio</name>
    <value>1.0</value>
  </property>
  <property>
    <name>hbase.superuser</name>
    <value>hadoop</value>
  </property>
  <property>
    <name>hbase.cluster.distributed</name>
    <value>true</value>
  </property>
  <property>
    <name>hbase.rpc.engine</name>
    <value>org.apache.hadoop.hbase.ipc.SecureRpcEngine</value>
  </property>
  <property>
    <name>zookeeper.session.timeout</name>
    <value>120000</value>
  </property>
  <property>
    <name>replication.replicationsource.implementation</name>
    <value>com.ngdata.sep.impl.SepReplicationSource</value>
  </property>
  <property>
    <name>hbase.coprocessor.regionserver.classes</name>
    <value>org.apache.hadoop.hbase.security.access.AccessController</value>
  </property>
  <property>
    <name>hbase.security.authorization</name>
    <value>true</value>
  </property>
  <property>
    <name>hbase.hregion.memstore.mslab.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>hbase.client.scanner.timeout.period</name>
    <value>120000</value>
  </property>
  <property>
    <name>hbase.hregion.majorcompaction</name>
    <value>0</value>
  </property>
  <property>
    <name>hbase.rootdir</name>
    <value>hdfs://bdpha/hbase</value>
  </property>
  <property>
    <name>hbase.hstore.blockingStoreFiles</name>
    <value>2100000000</value>
  </property>
  <property>
    <name>hbase.security.authentication</name>
    <value>simple</value>
  </property>
  <property>
    <name>hbase.hregion.max.filesize</name>
    <value>21474836480</value>
  </property>
  <property>
    <name>hbase.coprocessor.master.classes</name>
    <value>org.apache.hadoop.hbase.security.access.AccessController</value>
  </property>
  <property>
    <name>hbase.master.maxclockskew</name>
    <value>30000</value>
  </property>
  <property>
    <name>hbase.regionserver.handler.count</name>
    <value>20</value>
  </property>
</configuration>
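Two quick checks that HBase can reach the HA nameservice and that the cluster is healthy (a minimal sketch):
# The root directory should resolve through the bdpha nameservice
hadoop fs -ls hdfs://bdpha/hbase
# Summary of live/dead region servers and load
echo "status" | hbase shell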
3.2 hbase-env.sh
export JAVA_HOME=/opt/java/jre/
export HBASE_MANAGES_ZK=false
export HBASE_CLASSPATH=/opt/hadoop/etc/hadoop
export LD_LIBRARY_PATH=/usr/local/hadoop/lzo/lib:$LD_LIBRARY_PATH
export HBASE_PID_DIR=/opt/hbase/PID
export HBASE_MASTER_OPTS="-XX:PermSize=256m -XX:MaxPermSize=512m -Xms4096m -Xmx8192m"
export HBASE_REGIONSERVER_OPTS="-XX:PermSize=256m -XX:MaxPermSize=512m -Xms4096m -Xmx8192m"
3.3 log4j.properties
hbase.root.logger=WARN,console
hbase.security.logger=WARN,console
4. Spark Configuration
5. Flink Configuration
6. Solr Configuration
7. Hue Configuration
8. Presto Server Configuration
9. MySQL Configuration