PHP8.1.21版本已发布
vue8.1.21版本已发布
jquery8.1.21版本已发布

抓取“IC 交易网”供应商程序

原创
2016-06-13 10:47:23 1362浏览

/**
* 抓取“IC 交易网”供应商主程序
* author Lee.
* Last modify $Date: 2012-2-6 10:44:32$
* 注:本程序按照编码 GB2312 执行,因为“IC 交易网”网站是GB2312编码,数据库也得保持一致
*/
class ic {
private $key; // 型号
private $pageNum; // 页码

/**
 * Entry point: crawl the supplier listings for one part number.
 *
 * @param string $key Part number to search for.
 */
public function go($key) {
    $this->key = $key;
    // getPageNum() takes no arguments and works from object state, so the
    // key has to be stored before the page count is looked up.
    $totalPages = $this->getPageNum();
    $this->pageNum = $totalPages;
    $this->getInfo();
}

/**
 * Walk every result page for the current part number and store the
 * suppliers found on each one.
 *
 * Does nothing when the page count is missing or zero.
 */
private function getInfo() {
    // The page count is scraped text, so normalise it to an integer first.
    $pages = (int) $this->pageNum;

    // One loop covers both the single-page and the multi-page case; the
    // upper bound is inclusive so the last page is crawled too.
    for ($page = 1; $page <= $pages; $page++) {
        $shopUrls = $this->shopUrlMatchReArr($this->getContent($page));
        $this->isAddSuccess($shopUrls);
    }
}

/**
 * Scrape and store each supplier URL, printing the outcome of every insert.
 *
 * @param array $arr Supplier shop URLs.
 */
private function isAddSuccess($arr) {
    foreach ($arr as $shopUrl) {
        $supplier = $this->getInfoByShopUrl($shopUrl);
        $stored = $this->execAdd($supplier);
        echo $stored ? 'Add Success!!' : 'Add Faild!!';
    }
}

/**
 * Insert one scraped supplier record into the `ic` table.
 *
 * Records without a company name, and companies that isExists() reports as
 * already stored, are skipped.
 *
 * @param array $infoArr Supplier fields keyed by column name (company,
 *                       address, phone, mobile, fax, zip, person, qq, msn,
 *                       email, website, regDate, shopUrl).
 * @return bool True when the row was inserted; false when it was skipped or
 *              the insert failed.
 */
private function execAdd($infoArr) {
    if (empty($infoArr['company'])) {
        return false;
    }

    // Normalise the record so every bound column has a string value.
    $columns = array(
        'company', 'address', 'phone', 'mobile', 'fax', 'zip', 'person',
        'qq', 'msn', 'email', 'website', 'regDate', 'shopUrl',
    );
    $row = array();
    foreach ($columns as $column) {
        $row[$column] = isset($infoArr[$column]) ? (string) $infoArr[$column] : '';
    }

    $mysqli = $this->getDb();
    if ($this->isExists($mysqli, $infoArr)) {
        $mysqli->close();
        return false; // already stored
    }

    // The values are scraped from third-party pages, so they are bound as
    // parameters instead of being interpolated into the SQL text.
    $stmt = $mysqli->prepare(
        'INSERT INTO ic(company,address,phone,mobile,fax,zip,person,qq,msn,email,website,regDate,shopUrl) '
        . 'VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)'
    );
    $inserted = false;
    if ($stmt) {
        $stmt->bind_param(
            'sssssssssssss',
            $row['company'], $row['address'], $row['phone'], $row['mobile'],
            $row['fax'], $row['zip'], $row['person'], $row['qq'], $row['msn'],
            $row['email'], $row['website'], $row['regDate'], $row['shopUrl']
        );
        $inserted = $stmt->execute();
        $stmt->close();
    }

    // getDb() opens a fresh connection per call; release it here.
    $mysqli->close();
    return $inserted;
}

/**
 * Open a new MySQL connection using the GB2312 character set.
 *
 * @return mysqli Connected handle; the caller is responsible for closing it.
 * @throws RuntimeException When the connection or charset selection fails.
 */
private function getDb() {
    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    $mysqli = new mysqli('localhost', 'root', '1715544', 'weiku');
    if ($mysqli->connect_error) {
        throw new RuntimeException('Database connection failed: ' . $mysqli->connect_error);
    }

    // set_charset() also informs the client library of the encoding,
    // which a plain "SET NAMES" query does not.
    if (!$mysqli->set_charset('gb2312')) {
        throw new RuntimeException('Unable to select the gb2312 character set: ' . $mysqli->error);
    }
    return $mysqli;
}

/**
 * Check whether a company with the same name is already stored.
 *
 * @param mysqli $mysqli  Open database connection.
 * @param array  $infoArr Supplier record; only the 'company' key is read.
 * @return bool True when at least one matching row exists.
 */
private function isExists($mysqli, $infoArr) {
    // Bind the scraped name as a parameter so quotes in it cannot break
    // or alter the query.
    $stmt = $mysqli->prepare('SELECT company FROM ic WHERE company = ? LIMIT 1');
    if (!$stmt) {
        // The lookup could not run; do not report that as "already exists".
        return false;
    }

    $company = isset($infoArr['company']) ? (string) $infoArr['company'] : '';
    $stmt->bind_param('s', $company);
    $stmt->execute();
    $stmt->store_result();
    $found = $stmt->num_rows > 0;
    $stmt->close();

    return $found;
}

/**
 * Trim surrounding whitespace from a scraped value.
 *
 * @param string|null $str Raw value; null (no regex match) is treated as ''.
 * @return string
 */
private function formatString($str) {
    // Cast first: callers pass null when a field was not found, and
    // trim(null) is deprecated from PHP 8.1.
    return trim((string) $str);
}

/**
* 抓取信息
* @param $url
* @return ArrayObject
*/
private function getInfoByShopUrl($url) {
$re = $this->getUrlInfo($url);
if (stristr($re, '')) $re = preg_replace('/.*/Usi', '', $re);
preg_match_all('/(.+)/Usi', $re, $companyArr); <br> preg_match_all('/地址:(.*)/Usi', $re, $addressArr); <br> preg_match_all('/电话:(.*)/Usi', $re, $phoneArr); <br> preg_match_all('/手机:(.*)/Usi', $re, $mobileArr); <br> preg_match_all('/传真:(.*)/Usi', $re, $faxArr); <br> preg_match_all('/邮编:(.*)/Usi', $re, $zipArr); <br> preg_match_all('/联系人:(.*)/Usi', $re, $personArr); <br> preg_match_all('/QQ:(.*)/Usi', $re, $qqArr); <br> preg_match_all('/MSN:(.*)/Usi', $re, $msnArr); <br> preg_match_all('/Email:(.*)/Usi', $re, $emailArr); <br> preg_match_all('/网址:(.*)/Usi', $re, $websiteArr); <br> preg_match_all('/注册日期:(.*)/Usi', $re, $regDateArr); <br> $infoArr = array( <br> 'company'=>$this->formatString($companyArr[1][0]), <br> 'address'=>$this->formatString($addressArr[1][0]), <br> 'phone'=>$this->formatString($phoneArr[1][0]), <br> 'mobile'=>$this->formatString($mobileArr[1][0]), <br> 'fax'=>$this->formatString($faxArr[1][0]), <br> 'zip'=>$this->formatString($zipArr[1][0]), <br> 'person'=>$this->formatString($personArr[1][0]), <br> 'qq'=>$this->formatString($qqArr[1][0]), <br> 'msn'=>$this->formatString($msnArr[1][0]), <br> 'email'=>$this->formatString($emailArr[1][0]), <br> 'website'=>$this->stripATags($this->formatString($websiteArr[1][0])), <br> 'regDate'=>$this->formatString($regDateArr[1][0]), <br> 'shopUrl'=>$url <br> ); <br> return $infoArr; <br> } <br> <br> /**<br> * 根据页面获取供应商 url 数组<br> * @param string $re<br> * @return ArrayObject<br> */ <br> private function shopUrlMatchReArr($re) { <br> preg_match_all('/<a onmousemove='\".+\"' onmouseout="hidetip\(\)" href="//m.sbmmt.com/m/faq/(.+)" target="\_blank">.+/Usi', $re, $arr); <br> $arr = $this->formatUrlArr(array_unique($arr[1])); <br> return $arr; <br> } <br> <br> /**<br> * 格式化数组<br> * @param Array $arr<br> * @return ArrayObject<br> */ <br> private function formatUrlArr($arr) { <br> $newArr = array(); <br> foreach ($arr as $key=>$value) { <br> if ($this->isExistsHttp($value)) { <br> $newArr[$key] = $value; <br> } <br> } 
<br> return $newArr; <br> } <br> <br> /**<br> * 格式化 QQ<br> * @param string $str<br> * @return string<br> */ <br> private function formatQqMsn($str, $e='QQ') { <br> if (emptyempty($str)) return ''; <br> preg_match_all('/alt="'.$e.'\:(.+)"/Usi', $str, $arr); <br> if (count($arr[1])==1) return $arr[1][0]; <br> $newStr = null; <br> foreach ($arr[1] as $value) { <br> $newStr .= $value . ' '; <br> } <br> return rtrim($newStr, ' '); <br> } <br> <br> /**<br> * 去掉网址的 A 标签<br> * @param string $site<br> * @return string<br> */ <br> private function stripATags($site) { <br> $site = preg_replace('/</a><a.>(.+)/', '\1', $site); <br> return $site; <br> } <br> <br> /**<br> * 检查 url 是否有 http<br> * @param string $url<br> * @return bool<br> */ <br> private function isExistsHttp($url) { <br> if (stristr($url, 'http://')) { <br> return true; <br> } else { <br> return false; <br> } <br> } <br> <br> /**<br> * 获取页面内容<br> * @param Number $page<br> * @return string<br> */ <br> private function getContent($page=1) { <br> $re = file_get_contents($this->getUrl($this->key, $page)); <br> return $re; <br> } <br> <br> /**<br> * 获取页码<br> * @return Number<br> */ <br> private function getPageNum() { <br> $i = null; <br> $re = $this->getContent(); <br> preg_match_all('/共(.+)页/Usi', $re, $arr); <br> $i = $arr[1][0]; <br> return $i; <br> } <br> <br> /**<br> * 获取 URL 链接<br> * @param string $str<br> * @param int $page 页码<br> * @return string<br> */ <br> private function getUrl($str, $page=1) { <br> return "http://www.ic.net.cn/partsearch/searchinstock.asp?newtype=1&area=&Page={$page}&partnumber={$str}&mfg=&DateCode=&QTY=&PRICE=&Exact=&orderby=inputdate&qty_filter=50&usertype2=1&pack="; <br> } <br> <br> /**<br> * 获取页面内容<br> * @param string $url<br> * @return string<br> */ <br> private function getUrlInfo($url) { <br> $re = file_get_contents($url); <br> return $re; <br> } <br> } <br> <br> /*<br> 程序运行思路:根据“IC 交易网”的IC搜索功能,输入型号进行搜索,然后抓取供应商信息<br> <br> 数据库结构<br> CREATE TABLE `ic` (<br> `id` mediumint(8) unsigned 
NOT NULL auto_increment,<br> `company` varchar(500) NOT NULL,<br> `address` varchar(500) default NULL,<br> `phone` varchar(500) default NULL,<br> `mobile` varchar(500) default NULL,<br> `fax` varchar(300) default NULL,<br> `zip` varchar(300) default NULL,<br> `person` varchar(500) default NULL,<br> `qq` varchar(300) default NULL,<br> `msn` varchar(300) default NULL,<br> `email` varchar(500) default NULL,<br> `website` varchar(300) default NULL,<br> `regDate` varchar(500) default NULL,<br> PRIMARY KEY (`id`)<br> ) ENGINE=InnoDB DEFAULT CHARSET=gb2312<br> */ <br> <br> $i = new ic(); <br> $arr = array_unique(array('MAX3232', 'AML8613', 'MT6225A', 'OM8373PS/N3/A', 'PT7313', 'MAX8212ESA', 'TL431', 'S3C2440', 'TMS320F2812PGFA', 'PCM1704', 'AN6717', 'CA3162E', 'CA3161E', 'LM393N', 'DS18B20', 'SHT10', 'AML8613', 'AN6717', 'LM393N', 'CA3161E', 'CA3162E', 'PCM1704', 'STK392-040', 'K1667', 'MAX232', 'STM32F103', 'LM358')); <br> foreach ($arr as $v) { <br> $i->go($v); <br> } <br> ?> <br><?php <br /> /**<br> * 抓取“IC 交易网”供应商主程序<br> * author Lee.<br> * Last modify $Date: 2012-2-6 10:44:32$<br> * 注:本程序按照编码 GB2312 执行,因为“IC 交易网”网站是GB2312编码,数据库也得保持一致<br> */<br> class ic {<br> private $key; // 型号<br> private $pageNum; // 页码</a.>

/**
 * Entry point: crawl the supplier listings for one part number.
 *
 * @param string $key Part number to search for.
 */
public function go($key) {
    $this->key = $key;
    // getPageNum() takes no arguments and works from object state, so the
    // key has to be stored before the page count is looked up.
    $totalPages = $this->getPageNum();
    $this->pageNum = $totalPages;
    $this->getInfo();
}

/**
 * Walk every result page for the current part number and store the
 * suppliers found on each one.
 *
 * Does nothing when the page count is missing or zero.
 */
private function getInfo() {
    // The page count is scraped text, so normalise it to an integer first.
    $pages = (int) $this->pageNum;

    // One loop covers both the single-page and the multi-page case; the
    // upper bound is inclusive so the last page is crawled too.
    for ($page = 1; $page <= $pages; $page++) {
        $shopUrls = $this->shopUrlMatchReArr($this->getContent($page));
        $this->isAddSuccess($shopUrls);
    }
}

/**
 * Scrape and store each supplier URL, printing the outcome of every insert.
 *
 * @param array $arr Supplier shop URLs.
 */
private function isAddSuccess($arr) {
    foreach ($arr as $shopUrl) {
        $supplier = $this->getInfoByShopUrl($shopUrl);
        $stored = $this->execAdd($supplier);
        echo $stored ? 'Add Success!!' : 'Add Faild!!';
    }
}

/**
 * Insert one scraped supplier record into the `ic` table.
 *
 * Records without a company name, and companies that isExists() reports as
 * already stored, are skipped.
 *
 * @param array $infoArr Supplier fields keyed by column name (company,
 *                       address, phone, mobile, fax, zip, person, qq, msn,
 *                       email, website, regDate, shopUrl).
 * @return bool True when the row was inserted; false when it was skipped or
 *              the insert failed.
 */
private function execAdd($infoArr) {
    if (empty($infoArr['company'])) {
        return false;
    }

    // Normalise the record so every bound column has a string value.
    $columns = array(
        'company', 'address', 'phone', 'mobile', 'fax', 'zip', 'person',
        'qq', 'msn', 'email', 'website', 'regDate', 'shopUrl',
    );
    $row = array();
    foreach ($columns as $column) {
        $row[$column] = isset($infoArr[$column]) ? (string) $infoArr[$column] : '';
    }

    $mysqli = $this->getDb();
    if ($this->isExists($mysqli, $infoArr)) {
        $mysqli->close();
        return false; // already stored
    }

    // The values are scraped from third-party pages, so they are bound as
    // parameters instead of being interpolated into the SQL text.
    $stmt = $mysqli->prepare(
        'INSERT INTO ic(company,address,phone,mobile,fax,zip,person,qq,msn,email,website,regDate,shopUrl) '
        . 'VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)'
    );
    $inserted = false;
    if ($stmt) {
        $stmt->bind_param(
            'sssssssssssss',
            $row['company'], $row['address'], $row['phone'], $row['mobile'],
            $row['fax'], $row['zip'], $row['person'], $row['qq'], $row['msn'],
            $row['email'], $row['website'], $row['regDate'], $row['shopUrl']
        );
        $inserted = $stmt->execute();
        $stmt->close();
    }

    // getDb() opens a fresh connection per call; release it here.
    $mysqli->close();
    return $inserted;
}

/**
 * Open a new MySQL connection using the GB2312 character set.
 *
 * @return mysqli Connected handle; the caller is responsible for closing it.
 * @throws RuntimeException When the connection or charset selection fails.
 */
private function getDb() {
    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    $mysqli = new mysqli('localhost', 'root', '1715544', 'weiku');
    if ($mysqli->connect_error) {
        throw new RuntimeException('Database connection failed: ' . $mysqli->connect_error);
    }

    // set_charset() also informs the client library of the encoding,
    // which a plain "SET NAMES" query does not.
    if (!$mysqli->set_charset('gb2312')) {
        throw new RuntimeException('Unable to select the gb2312 character set: ' . $mysqli->error);
    }
    return $mysqli;
}

/**
 * Check whether a company with the same name is already stored.
 *
 * @param mysqli $mysqli  Open database connection.
 * @param array  $infoArr Supplier record; only the 'company' key is read.
 * @return bool True when at least one matching row exists.
 */
private function isExists($mysqli, $infoArr) {
    // Bind the scraped name as a parameter so quotes in it cannot break
    // or alter the query.
    $stmt = $mysqli->prepare('SELECT company FROM ic WHERE company = ? LIMIT 1');
    if (!$stmt) {
        // The lookup could not run; do not report that as "already exists".
        return false;
    }

    $company = isset($infoArr['company']) ? (string) $infoArr['company'] : '';
    $stmt->bind_param('s', $company);
    $stmt->execute();
    $stmt->store_result();
    $found = $stmt->num_rows > 0;
    $stmt->close();

    return $found;
}

/**
 * Trim surrounding whitespace from a scraped value.
 *
 * @param string|null $str Raw value; null (no regex match) is treated as ''.
 * @return string
 */
private function formatString($str) {
    // Cast first: callers pass null when a field was not found, and
    // trim(null) is deprecated from PHP 8.1.
    return trim((string) $str);
}

/**
 * Download a supplier's shop page and extract its contact fields.
 *
 * Each field is taken from the first match of its pattern; a field whose
 * pattern does not match becomes an empty string.
 *
 * @param string $url Supplier shop URL.
 * @return array Fields company, address, phone, mobile, fax, zip, person,
 *               qq, msn, email, website, regDate and shopUrl.
 */
private function getInfoByShopUrl($url) {
    $re = $this->getUrlInfo($url);
    if ($re === false) {
        // Download failed: continue with an empty page so every field is ''.
        $re = '';
    }

    // NOTE(review): the needle and the patterns below look like they lost
    // their HTML tag delimiters when this listing was published (an empty
    // needle, and lazy groups with nothing after them). Restore them from
    // the original script before relying on the extracted values.
    if (stristr($re, '')) $re = preg_replace('/.*/Usi', '', $re);

    // Column name => pattern whose first capture group holds the value.
    $patterns = array(
        'company' => '/(.+)/Usi',
        'address' => '/地址:(.*)/Usi',
        'phone'   => '/电话:(.*)/Usi',
        'mobile'  => '/手机:(.*)/Usi',
        'fax'     => '/传真:(.*)/Usi',
        'zip'     => '/邮编:(.*)/Usi',
        'person'  => '/联系人:(.*)/Usi',
        'qq'      => '/QQ:(.*)/Usi',
        'msn'     => '/MSN:(.*)/Usi',
        'email'   => '/Email:(.*)/Usi',
        'website' => '/网址:(.*)/Usi',
        'regDate' => '/注册日期:(.*)/Usi',
    );

    $infoArr = array();
    foreach ($patterns as $field => $pattern) {
        preg_match_all($pattern, $re, $matches);
        // Guard the lookup: a page without this field has no [1][0] entry.
        $raw = isset($matches[1][0]) ? $matches[1][0] : '';
        $infoArr[$field] = $this->formatString($raw);
    }

    // The website value may be wrapped in a link; keep only its text.
    $infoArr['website'] = $this->stripATags($infoArr['website']);
    $infoArr['shopUrl'] = $url;

    return $infoArr;
}

/**
 * Extract the distinct supplier shop URLs from one search-result page.
 *
 * @param string $re HTML of a search-result page.
 * @return array Shop URLs accepted by formatUrlArr(), keyed by match position.
 */
private function shopUrlMatchReArr($re) {
    // NOTE(review): the published pattern had no capture group, so $arr[1]
    // never existed. This one is reconstructed from the other, partly
    // mangled copy of this method in the same listing (supplier links
    // carrying onmousemove / onmouseout="hidetip()" / target="_blank");
    // verify it against a live result page.
    $pattern = '/<a onmousemove=.+ onmouseout="hidetip\(\)" href="(.+)" target="_blank">.+/Usi';
    preg_match_all($pattern, (string) $re, $arr);

    $links = isset($arr[1]) ? $arr[1] : array();
    return $this->formatUrlArr(array_unique($links));
}

/**
 * Keep only the entries that isExistsHttp() accepts, preserving their keys.
 *
 * @param array $arr Candidate URLs.
 * @return array Accepted URLs under their original keys.
 */
private function formatUrlArr($arr) {
    $kept = array();
    foreach ($arr as $index => $link) {
        if (!$this->isExistsHttp($link)) {
            continue;
        }
        $kept[$index] = $link;
    }
    return $kept;
}

/**
 * Collect the account ids found in alt="<label>:<id>" attributes.
 *
 * @param string $str Markup to search.
 * @param string $e   Label that prefixes the id inside the alt attribute.
 * @return string The single id as matched, or all ids joined by one space
 *                with trailing spaces removed; '' when there is none.
 */
private function formatQqMsn($str, $e='QQ') {
    if (empty($str)) {
        return '';
    }

    preg_match_all('/alt="'.$e.'\:(.+)"/Usi', $str, $found);
    $ids = $found[1];

    // A lone match is returned untouched (no trailing-space trimming).
    if (count($ids) == 1) {
        return $ids[0];
    }

    return rtrim(implode(' ', $ids), ' ');
}

/**
 * Remove <a ...>...</a> wrappers from a website value, keeping the link text.
 *
 * @param string $site Website value, possibly wrapped in an anchor tag.
 * @return string The value with anchor tags removed.
 */
private function stripATags($site) {
    // The published pattern was '/(.+)/' => '\1', which replaces the text
    // with itself. Match the anchor element explicitly instead and keep
    // only its inner text; lazy matching stops at the first closing tag.
    return preg_replace('/<a\b[^>]*>(.*)<\/a>/Usi', '\1', (string) $site);
}

/**
 * Tell whether the text contains "http://" (case-insensitive).
 *
 * @param string $url Text to inspect.
 * @return bool
 */
private function isExistsHttp($url) {
    // Position 0 is a valid hit, so compare against false explicitly.
    return stripos($url, 'http://') !== false;
}

/**
 * Download one search-result page for the current part number.
 *
 * @param int $page Result page to fetch.
 * @return string|false Page body as returned by file_get_contents().
 */
private function getContent($page=1) {
    $searchUrl = $this->getUrl($this->key, $page);
    return file_get_contents($searchUrl);
}

/**
 * Read the total number of result pages from the first search page.
 *
 * @return int Page count taken from the "共N页" marker; 0 when the marker
 *             is absent or not numeric.
 */
private function getPageNum() {
    $re = $this->getContent();

    // No marker (or a failed download) means there is nothing to crawl.
    if (!preg_match('/共(.+)页/Usi', (string) $re, $match)) {
        return 0;
    }

    // Drop any markup or padding around the number before converting it.
    return (int) trim(strip_tags($match[1]));
}

/**
 * Build the in-stock search URL for a part number and result page.
 *
 * @param string $str  Part number; URL-encoded before being inserted.
 * @param int    $page Result page.
 * @return string
 */
private function getUrl($str, $page=1) {
    // Part numbers can contain characters such as '/', '&' or spaces that
    // would otherwise corrupt the query string.
    $partNumber = urlencode((string) $str);
    $page = (int) $page;

    return "http://www.ic.net.cn/partsearch/searchinstock.asp?newtype=1&area=&Page={$page}&partnumber={$partNumber}&mfg=&DateCode=&QTY=&PRICE=&Exact=&orderby=inputdate&qty_filter=50&usertype2=1&pack=";
}

/**
 * Download the body of an arbitrary URL.
 *
 * @param string $url Address to fetch.
 * @return string|false Body as returned by file_get_contents().
 */
private function getUrlInfo($url) {
    return file_get_contents($url);
}
}

/*
程序运行思路:根据“IC 交易网”的IC搜索功能,输入型号进行搜索,然后抓取供应商信息

数据库结构
CREATE TABLE `ic` (
`id` mediumint(8) unsigned NOT NULL auto_increment,
`company` varchar(500) NOT NULL,
`address` varchar(500) default NULL,
`phone` varchar(500) default NULL,
`mobile` varchar(500) default NULL,
`fax` varchar(300) default NULL,
`zip` varchar(300) default NULL,
`person` varchar(500) default NULL,
`qq` varchar(300) default NULL,
`msn` varchar(300) default NULL,
`email` varchar(500) default NULL,
`website` varchar(300) default NULL,
`regDate` varchar(500) default NULL,
`shopUrl` varchar(500) default NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=gb2312
*/

// Crawl every part number once; array_unique() drops the repeated entries.
$i = new ic();
$arr = array_unique(array(
    'MAX3232', 'AML8613', 'MT6225A', 'OM8373PS/N3/A', 'PT7313', 'MAX8212ESA',
    'TL431', 'S3C2440', 'TMS320F2812PGFA', 'PCM1704', 'AN6717', 'CA3162E',
    'CA3161E', 'LM393N', 'DS18B20', 'SHT10', 'AML8613', 'AN6717', 'LM393N',
    'CA3161E', 'CA3162E', 'PCM1704', 'STK392-040', 'K1667', 'MAX232',
    'STM32F103', 'LM358'
));
foreach ($arr as $partNumber) {
    $i->go($partNumber);
}
?>


摘自 Lee.的专栏
声明:本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系admin@php.cn核实处理。