# shell爬虫 (shell web crawler)
#!/bin/bash
#
# Crawls xin.baidu.com for supplier records.
#
# For every input line of the form  "supplier name","id"  the script
#   1. downloads the search result page for the supplier name to <id>.html,
#   2. extracts the company "pid" from that page,
#   3. downloads the basic-info JSON (which carries the unified social credit
#      code) to <id>_unifiedCode.
# Downloads run in parallel; a FIFO-backed token pool bounds the concurrency.
#
# Usage: script [input_file] [concurrency]
#   input_file   CSV-like file to read (default: ../tmpa)
#   concurrency  number of parallel workers (default: 20)
# Environment:
#   PROXY_ADDR   SOCKS5 proxy host:port
#   PROXY_USER   proxy credentials as user:password

set -u

# NOTE(review): the proxy credentials below are hard-coded and were kept only
# so the script keeps working unchanged. They should be rotated and supplied
# through the environment (or a secret store) instead.
PROXY_ADDR="${PROXY_ADDR:-socks-cla.abuyun.com:8030}"
PROXY_USER="${PROXY_USER:-S822RB9T27K96TPC:5E68523C79E62C41}"

# Alternative HTTP proxy configuration. The original script assigned it and
# then immediately overwrote it, so it was never in effect:
# curl_cmd=(curl -x "http://http-pro.abuyun.com:9010" --proxy-basic --proxy-user "H78H42TCN191075P:3D1EA6E4F458AB69")

# The curl invocation is kept in an array so every argument stays one word.
curl_cmd=(curl -L --socks5 "${PROXY_ADDR}" --proxy-user "${PROXY_USER}")
# String form, kept for backward compatibility with code that reads curl_str.
curl_str="${curl_cmd[*]}"

# Prefix for result files (only referenced by the disabled "tee" in main).
result_file="result_code"

#######################################
# Build the search URL for a supplier name.
# Every byte of the name is percent-encoded (lowercase hex); embedded
# newlines are dropped.
# Arguments: $1 - supplier name
# Outputs:   the URL on stdout, without a trailing newline
#######################################
encode_url() {
  local supplier_name_encode
  # od emits whitespace-separated hex bytes over several lines; strip all
  # whitespace first, then prefix every byte with '%'.
  supplier_name_encode=$(
    printf '%s' "${1:-}" \
      | tr -d '\n' \
      | od -An -v -tx1 \
      | tr -d '[:space:]' \
      | sed 's/\(..\)/%\1/g'
  )
  printf '%s' "https://xin.baidu.com/s?q=${supplier_name_encode}&t=0"
}

#######################################
# Download the search result page for a supplier.
# Arguments: $1 - supplier name
#            $2 - file the HTML is written to
# Returns:   curl's exit status
#######################################
do_down_web_info_html() {
  local url
  url=$(encode_url "$1")
  local html_file="$2"
  # Nothing may be logged to stdout here: get_supplier_pid, which calls this
  # function, hands its result back to the caller through stdout.
  "${curl_cmd[@]}" "${url}" > "${html_file}"
}

#######################################
# Extract the company pid from a downloaded search page.
# Takes the first line containing "compinfo" and prints the text between the
# first 'pid=' and the following double quote.
# Arguments: $1 - HTML file
# Outputs:   the pid on stdout (nothing if no line matches)
#######################################
extract_pid() {
  awk -F'pid=' '/compinfo/ { split($2, parts, "\""); print parts[1]; exit }' "$1"
}

#######################################
# Download the search page for a supplier and print its pid.
# Arguments: $1 - id, used as the HTML file name prefix (<id>.html)
#            $2 - supplier name
# Outputs:   the pid on stdout (empty if none was found)
#######################################
get_supplier_pid() {
  local html_file="${1}.html"
  local supplier_name="$2"
  do_down_web_info_html "${supplier_name}" "${html_file}"
  # Extract the pid.
  extract_pid "${html_file}"
}

#######################################
# Download the basic-info JSON (unified social credit code) for a pid.
# Arguments: $1 - id, used as the output file prefix (<id>_unifiedCode)
#            $2 - pid
# Outputs:   a progress line on stdout
# Returns:   curl's exit status
#######################################
do_down_supplier_unifiedcode() {
  local id="$1"
  local pid="$2"
  # Fetch the unified social credit code.
  local unifiedCode="${id}_unifiedCode"
  local api_basic_url="https://xin.baidu.com/detail/basicAjax?pid=${pid}"
  # The proxy arguments are deliberately left out of the log line so the
  # credentials do not end up in captured output.
  printf '\n 获取统一社会信用代码 ===================================> curl %s > %s\n\n' \
    "${api_basic_url}" "${unifiedCode}"
  "${curl_cmd[@]}" "${api_basic_url}" > "${unifiedCode}"
}

#######################################
# Hook for rotating the proxy exit IP. The actual request is disabled.
#######################################
do_switch_ip() {
  # "${curl_cmd[@]}" http://proxy.abuyun.com/switch-ip
  echo "do_switch_ip"
}

#######################################
# Process one supplier: resolve its pid, then fetch the detail JSON.
# Arguments: $1 - id
#            $2 - supplier name
# Returns:   1 if no pid could be resolved, otherwise the download status
#######################################
do_run() {
  local id="$1"
  local supplier_name="$2"
  local pid
  pid=$(get_supplier_pid "${id}" "${supplier_name}")
  if [[ -z "${pid}" ]]; then
    # Without a pid the detail request cannot return anything useful.
    printf 'do_run: no pid found for %s (%s), skipping detail download\n' \
      "${id}" "${supplier_name}" >&2
    return 1
  fi
  do_down_supplier_unifiedcode "${id}" "${pid}"
  # echo -n $(date "+%F %T")" | ${id} | ${supplier_name} | ${pid} |"
  # Convert the JSON to a delimited line:
  # jq -r '[(.data.entName|tostring),(.data.unifiedCode|tostring)]|join("|")' "${id}_unifiedCode"
}

#######################################
# Split one input line of the form  "supplier name","id"  into its fields.
# The line is split at the LAST comma, so names containing spaces or commas
# survive intact. Double quotes, a trailing CR and surrounding whitespace are
# removed.
# Arguments: $1 - raw input line
# Outputs:   "<id>|<supplier name>" on stdout
#######################################
parse_supplier_line() {
  local line="${1:-}"
  line="${line%$'\r'}"
  local name=""
  local id=""
  if [[ "${line}" == *,* ]]; then
    name="${line%,*}"
    id="${line##*,}"
  else
    name="${line}"
  fi
  name="${name//\"/}"
  id="${id//\"/}"
  id="${id//[[:space:]]/}"
  # Trim leading and trailing whitespace from the name.
  name="${name#"${name%%[![:space:]]*}"}"
  name="${name%"${name##*[![:space:]]}"}"
  printf '%s|%s\n' "${id}" "${name}"
}

#######################################
# Create the token pool that limits concurrency.
# A FIFO is opened read/write on the given file descriptor and filled with
# one line ("token") per allowed worker. A worker takes a token by reading a
# line from the descriptor and gives it back by writing a line to it.
# Arguments: $1 - file descriptor number to use (must be below `ulimit -n`)
#            $2 - number of tokens (optional, default 20)
# Exits the script with status 1 on invalid arguments or setup failure.
#######################################
token() {
  local pid="${1:-}"
  # A descriptor number is mandatory.
  if [[ -z "${pid}" ]]; then
    echo "please input pid" >&2
    exit 1
  fi
  # The value is interpolated into an eval below, so accept digits only.
  if [[ ! "${pid}" =~ ^[0-9]+$ ]]; then
    echo "token: file descriptor must be numeric: ${pid}" >&2
    exit 1
  fi

  # Concurrency, 20 by default.
  local concurrency=20
  if [[ -n "${2:-}" ]]; then
    concurrency=$2
    echo "Concurrency: $2"
  fi

  # Create the FIFO inside a private temporary directory rather than at a
  # predictable path in /tmp.
  local fifo_dir
  fifo_dir=$(mktemp -d) || { echo "token: mktemp failed" >&2; exit 1; }
  local fifo="${fifo_dir}/tokens"
  if ! mkfifo "${fifo}"; then
    rm -rf -- "${fifo_dir}"
    echo "token: mkfifo failed" >&2
    exit 1
  fi

  # Open the FIFO for reading and writing on the requested descriptor. The
  # descriptor number is held in a variable, and a redirection needs a
  # literal number, hence the eval (input validated above).
  if ! eval "exec ${pid}<>\"\${fifo}\""; then
    rm -rf -- "${fifo_dir}"
    echo "token: cannot open descriptor ${pid}" >&2
    exit 1
  fi
  # Once open, the descriptor keeps the pipe alive; the file can be removed.
  rm -rf -- "${fifo_dir}"

  # Fill the pool: each line written is one token.
  local i
  for ((i = 1; i <= concurrency; i++)); do
    echo "${i}" >&"${pid}"
  done
}

#######################################
# Entry point: read the supplier list and crawl it with bounded parallelism.
# Only lines 9601-10000 of the input (head -n 10000 | tail -n 400) are
# processed, as in the original script.
# Arguments: $1 - input file (optional, default ../tmpa)
#            $2 - concurrency (optional, default 20)
# Returns:   1 if the input file cannot be read
#######################################
main() {
  local input_file="${1:-../tmpa}"
  local concurrency="${2:-}"
  # Fixed, small descriptor number. The process id must not be used here: it
  # usually exceeds the open-file limit, in which case the redirection fails
  # and the concurrency limit silently stops working.
  local token_fd=9

  if [[ ! -r "${input_file}" ]]; then
    printf 'main: cannot read input file: %s\n' "${input_file}" >&2
    return 1
  fi

  local start_time
  start_time=$(date +%s)

  # Create the token pool.
  if [[ -n "${concurrency}" ]]; then
    token "${token_fd}" "${concurrency}"
  else
    token "${token_fd}"
  fi

  local num=0
  local line name parsed id supplier_name
  # The loop is fed by process substitution instead of a pipeline so that it
  # runs in this shell: the background jobs are then direct children and the
  # `wait` below really waits for them.
  while IFS= read -r line || [[ -n "${line}" ]]; do
    num=$((num + 1))
    if ((num == 100)); then
      do_switch_ip
      num=0
      echo "==================================== reset num ===================================="
    fi

    parsed=$(parse_supplier_line "${line}")
    id="${parsed%%|*}"
    supplier_name="${parsed#*|}"
    if [[ -z "${id}" || -z "${supplier_name}" ]]; then
      printf 'main: skipping malformed line: %s\n' "${line}" >&2
      continue
    fi

    # Take a token; blocks while all workers are busy.
    read -r -u "${token_fd}" name
    {
      echo "${id} | ${supplier_name}"
      do_run "${id}" "${supplier_name}" # | tee -a ${result_file}_${name}
      # Give the token back whether or not the download succeeded.
      echo "${name}" >&"${token_fd}"
    } &
  done < <(head -n 10000 -- "${input_file}" | tail -n 400)
  wait

  # Report the elapsed time.
  local stop_time
  stop_time=$(date +%s)
  echo "TIME:$((stop_time - start_time))"

  # Close the token descriptor (read and write side share one descriptor).
  eval "exec ${token_fd}>&-"
}

main "$@"