CURL+DOM采集小样

浏览:2592 发布日期:2015/01/28 分类:用法示例 关键字: DOMdocument curl
个人感觉效率非常高的采集方式,但DOMdocument好像无法采集到带标签的内容,采集纯文本非常快.
大部分功能都没来得及做就被否了,只能提供大家参考用.
可以看到CURL代理访问+ZEND_DOM采集用法.
主要看CollectGoodsController.class.php就可以了.
    //获取店铺栏目
    public function getShopCate($shop_url="",$continue=0){
        $source=file_get_contents('Offline/shops.htm');
        $shops_id=1;
        $mall_id=1;
        $cate=M('goods_category')->where(array('shops_id'=>$shops_id))->find();
        if(!empty($cate)){
            return fasle;//采集店铺栏目已存在
            //$this->error('采集店铺栏目已存在');
        }

        import('@.Tao.TaoHttp','','.php');
        $Http= new \TaoHttp();
        $shop_html=$Http->encoding($source);

        $shop_category_rule=D('CollectGoods')->getRule($mall_id,'shop_category');
        import('@.Tao.Dom.Query','','.php');
        $Dom= new \Zend_Dom_Query($shop_html);
        $shop_category=$Dom->query($shop_category_rule);
        if(count($shop_category)==0){
            return false;//采集不到店铺栏目
            //$this->error('采集不到店铺栏目');
        }
        $result=array();
        foreach ($shop_category as $key => $value) {
            $result[$key]['url']=$value->getAttribute('href');
            $result[$key]['name']=trim($value->nodeValue);
        }
        unset($result[0]);
        
        $data=array();
        $time=time();
        $cate_url=array();
        foreach ($result as $value) {
            $dataTmp=array(
                'shops_id'=> $shops_id,
                'cate_name'=> $value['name'],
                'cate_url'=> $value['url'],
                'collect_time'=>$time,
            );
            $cate_url[]=$value['url'];
            $data[]=$dataTmp;
        }
        M('goods_category')->addAll($data);
        return true;//采集店铺栏目成功
    }
    //获取店铺一个栏目商品
    //http://localhost/TaoGoods/index.php?m=Taogoods&c=CollectGoods&a=getShopGoods&cate_id=3
    public function getShopGoods($cate_id=0){
        if($cate_id==0){return false;}
        $goods_time=M('goods')->where(array('cate_id'=>$cate_id))->getField('collect_time');

        if($goods_time){
            if($goods_time + 86400*$this->day > time()){
                $this->error('15天内请勿重复采集',U('index'));
            }
            $this->error('采集店铺栏目下货品已存在',U('index'));
        }

        $cate_data=M('goods_category')->find($cate_id);
        $shops_id=$cate_data['shops_id'];
        $cate_id=$cate_data['id'];
        $mall_id=$cate_data['mall_id'];

        import('@.Tao.TaoHttp','','.php');
        $Http= new \TaoHttp();
        $source=$Http->get($cate_data['cate_url']);
        $shop_html=$Http->encoding($source);

        $cate_rule=D('CollectGoods')->getRule($mall_id);
        import('@.Tao.Dom.Query','','.php');
        $Dom= new \Zend_Dom_Query($shop_html);
        $cate_imgs=$Dom->query($cate_rule['shop_category_goods_img']);
        $cate_names=$Dom->query($cate_rule['shop_category_goods_name']);
        $cate_sales=$Dom->query($cate_rule['shop_category_goods_sale']);
        $cate_cprices=$Dom->query($cate_rule['shop_category_goods_cprice']);
        //$cate_sprices=$Dom->query($cate_rule['shop_category_goods_sprice']);
        
        $num=count($cate_names);

        $time=time();
        $result=array();
        for ($i=0; $i <$num ; $i++) {
            $result[$i]['goods_thumb']=$cate_imgs->bykey($i)->getAttribute('src');
            $result[$i]['goods_name']=$cate_names->bykey($i)->nodeValue;
            $result[$i]['goods_url']=$cate_names->bykey($i)->getAttribute('href');
            $result[$i]['goods_cprice']=$cate_cprices->bykey($i)->nodeValue;
            $result[$i]['goods_sale']=$cate_sales->bykey($i)->nodeValue;
            //$result[$i]['goods_spirce']=$cate_sprices->bykey($i)->nodeValue;
            $result[$i]['mall_id']=$mall_id;
            $result[$i]['shops_id']=$shops_id;
            $result[$i]['cate_id']=$cate_id;
            $result[$i]['collect_time']=$time;
        }

        if(M('goods')->addAll($result)){
            $this->success('采集店铺栏目下货品成功',U('index'));
        }
    /**
     * 判断网页数据,转GBK等到UTF-8
     */ 
    public function encoding($source){
        $encode = mb_detect_encoding($source, array("GBK","UTF-8","GB2312","BIG5"));
        if($encode=='CP936'){
            $source=iconv("GBK", "UTF-8//IGNORE", $source);
            //$meta用于DOM判断编码
            $meta = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';
            $source=$meta.$source;
        }
        return $source;
    }
测试:只有这两个按钮能用,其他的都不能用


测试的话 可以将goods表清空 点击采集货品
sql文件在压缩包里

BY:悠悠山雨

附件 Taogoods.zip ( 2.2 MB 下载:97 次 )

评论( 相关
后面还有条评论,点击查看>>