1 <?php
2 /**
3 * 抓取“华强电子网”供应商主程序
4 * author Lee.
5 * Last modify $Date: 2012-2-2 12:55:35 $
6 */
7 require_once './config.inc.php';
/**
 * Scraper for supplier contact details found through the IC search of
 * www.hqew.com. For a given part number it walks the search result pages,
 * opens each supplier's contact page, extracts the contact fields and stores
 * them in the `huaqiang` table through the project's Model class.
 *
 * Usage: instantiate the class and call go($key) with a part number.
 */
class huaqiang {
    /** @var string Part number currently being searched. */
    private $key;

    /** @var int Number of search result pages for the current part number. */
    private $pageNum;

    /**
     * @var array Search result pages already downloaded for the current
     *            part number (page number => HTML). Avoids requesting the
     *            same page again while checking, counting and scraping.
     */
    private $pageCache = array();

    /**
     * Entry point: scrape every supplier listed for one part number.
     *
     * @param string $key Part number to search for.
     * @return void
     */
    public function go($key) {
        $this->key = $key;
        $this->pageCache = array(); // pages cached for a previous key are stale
        if ($this->checkIsExistsData()) {
            $this->pageNum = $this->getPageNum();
            $this->getInfo();
        }
        $this->pageCache = array();
    }

    /**
     * Download (or return the cached copy of) one search result page.
     *
     * @param int $page Result page number, starting at 1.
     * @return string|false Page HTML, or false when the download failed.
     */
    private function getContent($page=1) {
        if (isset($this->pageCache[$page])) {
            return $this->pageCache[$page];
        }
        $re = $this->getUrlInfo($this->getUrl($this->key, $page));
        if ($re !== false) {
            // Only successful downloads are cached so that a failure can be retried.
            $this->pageCache[$page] = $re;
        }
        return $re;
    }

    /**
     * Check whether the first result page contains data, i.e. whether it
     * shows the pager element marking page 1 as the current page.
     *
     * @return bool true when data exists, false otherwise (including a failed download).
     */
    private function checkIsExistsData() {
        $re = $this->getContent();
        return $re !== false && stristr($re, '<span class="s_curr g_vm">1</span>') !== false;
    }

    /**
     * Walk all result pages, scrape every supplier contact page found on
     * them and store each supplier. Pauses between requests.
     *
     * @return void
     */
    private function getInfo() {
        // A single loop covers both the one-page and the multi-page case.
        for ($i = 1; $i <= $this->pageNum; $i++) {
            $re = $this->getContent($i);
            unset($this->pageCache[$i]); // the page is not needed again; free the memory
            $arr = $this->shopAddContact($this->shopUrlMatchReArr($re));
            foreach ($arr as $v) {
                $infoArr = $this->getInfoByShopUrl($v);
                // Skip suppliers whose contact page could not be fetched or parsed.
                if ($infoArr !== null && $this->execAdd($infoArr)) echo 'Add Success!!';
                $this->sleep();
            }
            $this->sleep();
        }
    }

    /**
     * Insert one supplier into the `huaqiang` table unless a row with the
     * same company name already exists.
     *
     * @param array $infoArr Supplier fields as returned by getInfoByShopUrl().
     * @return mixed Result of Model::insert(), or 0 when nothing was inserted.
     */
    private function execAdd($infoArr) {
        $num = 0; // previously undefined when the company already existed
        $m = new Model();
        // The company name is scraped (untrusted) text that ends up inside a
        // SQL condition, so it must be escaped.
        // NOTE(review): Model's API is not visible here; if it offers
        // parameter binding or its own escaping helper, prefer that.
        $company = addslashes($infoArr['company']);
        if (!$m->isExists('huaqiang', "company='{$company}'")) {
            $columns = array('company','mobile','phone','fax','region','address','website','zip','email','qq','msn','market','shopUrl');
            $values = array();
            foreach ($columns as $column) {
                $values[] = $infoArr[$column];
            }
            $num = $m->insert('huaqiang', $columns, $values);
        }
        return $num;
    }

    /**
     * Download a supplier contact page and extract its contact fields.
     *
     * @param string $url Supplier contact page URL.
     * @return array|null Associative array of supplier fields, or null when
     *                    the page could not be downloaded or parsed.
     */
    private function getInfoByShopUrl($url) {
        $re = $this->getUrlInfo($url);
        if ($re === false) {
            return null;
        }
        return $this->parseShopInfo($re, $url);
    }

    /**
     * Extract the supplier fields from the HTML of a contact page.
     *
     * @param string $re  Contact page HTML.
     * @param string $url URL the HTML was downloaded from (stored as shopUrl).
     * @return array|null Supplier fields, or null when the mandatory block
     *                    (company, phone, region, address) was not found.
     */
    private function parseShopInfo($re, $url) {
        // Mandatory fields are matched together and in page order.
        // NOTE(review): this pattern also requires a QQ entry to be present,
        // so a page without one yields no match at all — confirm that is intended.
        $matched = preg_match('/<li class=\"g\_fl tit\">公司名称:<\/li><li class="g_fl cont">(.+)<\/li>.+<li class="g_fl tit">电话:<\/li><li class=\"g\_fl cont\">(.+)<\/li>.+<li class=\"g\_fl tit\">所在地区:<\/li><li class=\"g\_fl cont\">(.+)<\/li>.+<li class=\"g\_fl tit\">详细地址:<\/li><li class=\"g\_fl cont\">(.+)<\/li>.+<li class=\"g\_fl tit\">QQ:<\/li><li class=\"g\_fl cont\">(.+)<\/li>/Usi', $re, $shopArr);
        if (!$matched) {
            return null;
        }
        $company = trim($shopArr[1]);
        if ($company === '') {
            return null; // the company name is the de-duplication key and must not be empty
        }

        // Optional fields: each one is looked up independently (first occurrence wins).
        $patterns = array(
            'mobile'  => '/<li class=\"g\_fl tit\">手机:<\/li><li class=\"g\_fl cont\">(.*)<\/li>/Usi',
            'fax'     => '/<li class=\"g\_fl tit\">传真:<\/li><li class=\"g\_fl cont\">(.*)<\/li>/Usi',
            'website' => '/<li class=\"g\_fl tit\">网址:<\/li><li class=\"g\_fl cont\">(.*)<\/li>/Usi',
            'msn'     => '/<li class=\"g\_fl tit\">MSN:<\/li><li class=\"g\_fl cont\">(.*)<\/li>/Usi',
            'zip'     => '/<li class=\"g\_fl tit\">邮政编码:<\/li><li class=\"g\_fl cont\">(.*)<\/li>/Usi',
            'market'  => '/<li class=\"g\_fl tit\">所属电子市场:<\/li><li class=\"g\_fl cont\">(.*)<\/li>/Usi',
            'email'   => '/<li class=\"g\_fl tit\">电子邮箱:<\/li><li class=\"g\_fl cont cor\">(.*)<\/li>/Usi',
            'qq'      => '/<li class=\"g\_fl tit\">QQ:<\/li><li class=\"g\_fl cont\">(.*)<\/li>/Usi',
        );
        $optional = array();
        foreach ($patterns as $field => $pattern) {
            $optional[$field] = (preg_match($pattern, $re, $hit) && !empty($hit[1])) ? $hit[1] : '';
        }

        return array(
            'company' => $company,
            'mobile'  => $optional['mobile'],
            // Trim after stripping: replacing </span> leaves a trailing space.
            'phone'   => trim($this->stripPhoneTags($shopArr[2])),
            'fax'     => $optional['fax'],
            'region'  => trim($shopArr[3]),
            'address' => trim($shopArr[4]),
            'zip'     => $optional['zip'],
            'email'   => $optional['email'],
            'qq'      => $optional['qq'],
            'msn'     => $optional['msn'],
            'market'  => $optional['market'],
            'website' => $optional['website'] === '' ? '' : $this->stripATags($optional['website']),
            'shopUrl' => $url
        );
    }

    /**
     * Append "/contact.html" to every supplier shop URL.
     *
     * @param array $arr Supplier shop URLs.
     * @return array The same URLs (keys preserved) pointing at the contact page.
     */
    private function shopAddContact($arr) {
        foreach ($arr as $k=>$v) {
            $arr[$k] = $v . '/contact.html';
        }
        return $arr;
    }

    /**
     * Replace every <a ...>text</a> element with its inner text.
     *
     * @param string $site Website field, possibly wrapped in anchor tags.
     * @return string The field without anchor tags.
     */
    private function stripATags($site) {
        // [^>]* and the lazy group keep each anchor separate; the previous
        // greedy pattern swallowed everything up to the last link.
        return preg_replace('/<a\b[^>]*>(.*?)<\/a>/is', '$1', $site);
    }

    /**
     * Remove the markup surrounding phone numbers: <span> and <br /> are
     * dropped, </span> becomes a space separating consecutive numbers.
     *
     * @param string $phone Raw phone field.
     * @return string Phone numbers separated by spaces.
     */
    private function stripPhoneTags($phone) {
        return str_replace(array('<span>', '</span>', '<br />'), array('', ' ', ''), $phone);
    }

    /**
     * Collect the distinct supplier shop URLs linked from a result page.
     *
     * @param string|false $re Result page HTML (false when the download failed).
     * @return array Distinct shop URLs; empty when there are none.
     */
    private function shopUrlMatchReArr($re) {
        if (!is_string($re) || $re === '') {
            return array();
        }
        preg_match_all('/<li class="col3"><a class=\"company\" target=\"\_blank\" href=\"(.+)\" value=\".+\">.+<\/a>/Usi', $re, $arr);
        return isset($arr[1]) ? array_unique($arr[1]) : array();
    }

    /**
     * Count the result pages by requesting them one after another until the
     * last one is reached.
     *
     * @return int Number of the last page that was requested (at least 1).
     */
    private function getPageNum() {
        $i = 1;
        while (true) {
            $re = $this->getContent($i);
            // Stop when the download failed or the page lists no suppliers;
            // otherwise a network error or a markup change would make this
            // loop request pages forever.
            if ($re === false || !$this->shopUrlMatchReArr($re)) break;
            // Presumably the disabled "next page" control that is only shown
            // on the last page — verify against the live markup.
            if (stristr($re, '<span class="g_vm s_f0f s_f0f1" title="下一页">')) break;
            $i++;
            $this->sleep();
        }
        return $i;
    }

    /**
     * Build the search result URL for a part number and page.
     *
     * @param string $str  Part number.
     * @param int    $page Result page number.
     * @return string
     */
    private function getUrl($str, $page=1) {
        // Percent-encode the part number so characters such as "/" cannot
        // change the structure of the URL path.
        $str = rawurlencode($str);
        return "http://www.hqew.com/ic/{$str}_____0_00_0_{$page}.html";
    }

    /**
     * Download the content of a URL.
     *
     * @param string $url
     * @return string|false Response body, or false on failure.
     */
    private function getUrlInfo($url) {
        return file_get_contents($url);
    }

    /**
     * Pause between requests.
     *
     * @param int $seconds Seconds to wait, 5 by default.
     * @return void
     */
    private function sleep($seconds=5) {
        sleep($seconds);
    }
}
207 /**
208 * 使用方法:1、先实例化一个类;2、调用 go($param) 方法,$param 为型号
209 * 程序运行思路:根据“华强电子网”的IC搜索功能,输入型号进行搜索,然后抓取供应商信息
210 */
211 /**
212 * 数据库结构
213 *
214 CREATE TABLE `huaqiang` (
215 `id` mediumint(8) unsigned NOT NULL auto_increment,
216 `company` varchar(500) NOT NULL,
217 `mobile` varchar(500) NOT NULL,
218 `phone` varchar(500) NOT NULL,
219 `fax` varchar(500) NOT NULL,
220 `region` varchar(500) NOT NULL,
221 `address` varchar(500) NOT NULL,
222 `website` varchar(200) NOT NULL,
223 `zip` varchar(100) NOT NULL,
224 `email` varchar(500) NOT NULL,
225 `qq` varchar(200) NOT NULL,
226 `msn` varchar(200) NOT NULL,
227 `market` varchar(500) NOT NULL,
228 `shopUrl` varchar(200) NOT NULL,
229 PRIMARY KEY (`id`)
230 ) ENGINE=InnoDB DEFAULT CHARSET=utf8
231 */
// Part numbers to look up; the list may contain repeats, which are dropped
// before scraping so that each part number is searched only once.
$partNumbers = array(
    'MAX3232', 'AML8613', 'MT6225A', 'OM8373PS/N3/A', 'PT7313', 'MAX8212ESA',
    'TL431', 'S3C2440', 'TMS320F2812PGFA', 'PCM1704', 'AN6717', 'CA3162E',
    'CA3161E', 'LM393N', 'DS18B20', 'SHT10', 'AML8613', 'AN6717', 'LM393N',
    'CA3161E', 'CA3162E', 'PCM1704', 'STK392-040', 'K1667', 'MAX232',
    'STM32F103', 'LM358'
);

// One scraper instance is reused for every part number.
$scraper = new huaqiang();
foreach (array_unique($partNumbers) as $partNumber) {
    $scraper->go($partNumber);
}
237 ?>