采集新浪科技文章代码

浏览:1324 发布日期:2016/06/09 分类:功能实现 关键字: php,代码
适用于 ThinkPHP 的新浪科技文章一键采集代码
/**
 * Scrape Sina Tech articles and store the new ones in the `post` table.
 *
 * Reads the number of list pages to crawl from $_POST['get_post_page_num']
 * (defaults to 1), fetches each list page, follows every article link found
 * in it, extracts the article body and inserts a row (title, content,
 * datetime) when no post with the same title exists yet.
 *
 * Entries whose title, link or article body cannot be extracted are skipped
 * instead of being stored as empty rows.
 *
 * Output: echoes a JSON string reporting how many posts were added.
 */
public function sina_tech() {
    /* Number of list pages to crawl; a missing or non-positive value means one page. */
    $page_num = isset($_POST['get_post_page_num']) ? intval($_POST['get_post_page_num']) : 0;
    if ($page_num < 1) {
        $page_num = 1;
    }

    /* Row count before crawling, used to report how many posts were added. */
    $post_count_a = M('post')->count();

    for ($page = 1; $page <= $page_num; $page++) {
        // NOTE(review): everything after '#' is a URL fragment, which HTTP clients
        // normally do not send to the server, so "page=" may never reach Sina and
        // every iteration may fetch the same list. Confirm what CurlGetPage sends.
        $fullpage = CurlGetPage('http://roll.tech.sina.com.cn/s/channel.php?ch=05#col=30&spec=&type=&ch=05&k=&offset_page=0&offset_num=0&num=5&asc=&page='.$page);

        /* Isolate the article list container; skip the page when it is absent. */
        if (!preg_match_all('/<div\s+.*\s+class="d_list_txt".*>\s+(.*)\s+<\/div>/Us', (string) $fullpage, $match)) {
            continue;
        }

        /* The list page is converted from GB2312; iconv returns false on failure. */
        $list_html = iconv("GB2312", "UTF-8", $match[1][0]);
        if ($list_html === false) {
            continue;
        }

        if (!preg_match_all('/<li>(.*)<\/li>/isU', $list_html, $in_li_tags)) {
            continue;
        }

        foreach (array_unique($in_li_tags[1]) as $row) {
            /* Title: text of the first anchor that opens in a new window. */
            if (!preg_match('/<a href="[^"]*" target="_blank">(.*)<\/a>/', $row, $title_match)) {
                continue;
            }
            $title = $title_match[1];

            /* Link: first href in the list item; nothing to fetch without it. */
            if (!preg_match('/href="([^"]*)"/', $row, $link_match) || $link_match[1] === '') {
                continue;
            }
            $link = $link_match[1];

            /* Date: the list item's c_time text is prefixed with the current year and
               suffixed with seconds; fall back to the current time when it is missing. */
            if (preg_match('/<span class="c_time">(.*)<\/span>/i', $row, $date_match)) {
                $date = date("Y-", time()) . $date_match[1] . ':00';
            } else {
                $date = date('Y-m-d H:i:s');
            }

            /* Fetch the article page. */
            $fullpage_post = (string) CurlGetPage($link);

            /* Unwrap image wrappers and drop the pagination block so that the
               non-greedy body match below is not cut short by their closing </div>. */
            $fullpage_post = preg_replace('/<div class="img_wrapper">(.*)<\/div>/isU', '${1}', $fullpage_post);
            $fullpage_post = preg_replace('/<div class="content-page" >(.*)<\/div>/Us', '', (string) $fullpage_post);

            /* Article body; skip the entry when the container is not found. */
            if (!preg_match('/<div\s+.*\s+id="artibody".*>\s+(.*)\s+<\/div>/Us', (string) $fullpage_post, $content_match)) {
                continue;
            }

            /* Strip anchor tags but keep their inner text. */
            $post_content = preg_replace("/<a[^>]*>(.*)<\/a>/isU", '${1}', $content_match[1]);
            if ($post_content === null) {
                continue;
            }

            /* Save only when the title is new. The array condition lets the framework
               escape the scraped title instead of splicing it into raw SQL. */
            $post_title_count = M('post')->where(array('title' => $title))->count();
            if ($post_title_count == 0) {
                $dataMySql = array(
                    "title"    => $title,
                    "content"  => $post_content,
                    "datetime" => $date,
                );
                M('post')->add($dataMySql);
            }
        }
    }

    /* Row count after crawling. */
    $post_count_b = M('post')->count();
    $post_add_num = $post_count_b - $post_count_a;

    /* Report the result to the caller as JSON. */
    if ($post_count_a == $post_count_b) {
        echo '{"success":1,"msg":"文章数无变化"}';
    } else {
        echo '{"success":1,"msg":"成功采集 ' . $post_add_num . ' 篇文章"}';
    }
}
评论( 相关
后面还有条评论,点击查看>>