方法一:使用 cURL(推荐,功能最强大)
<?php
/**
 * Fetch a web page with cURL and return its status code, headers and body.
 *
 * Defaults: return the transfer as a string, follow up to 5 redirects,
 * 30 second timeout, browser-like User-Agent, response headers included in
 * the raw output, and every content encoding the local libcurl supports.
 *
 * SECURITY NOTE: TLS peer/host verification is disabled by default, exactly as
 * in the original snippet (intended for testing only). For production pass
 * [CURLOPT_SSL_VERIFYPEER => true, CURLOPT_SSL_VERIFYHOST => 2] in $options.
 *
 * @param string $url     Target URL.
 * @param array  $options Optional map of CURLOPT_* constant => value; entries
 *                        override the defaults listed above.
 * @return array On success:
 *               ['success' => true, 'http_code' => int, 'header' => string,
 *                'body' => string, 'url' => string, 'effective_url' => string].
 *               On failure: ['success' => false, 'error' => string].
 */
function fetchWebpageWithCurl($url, $options = []) {
    $ch = curl_init();
    if ($ch === false) {
        return [
            'success' => false,
            'error' => 'cURL初始化失败'
        ];
    }

    $defaultOptions = [
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true, // return the response instead of printing it
        CURLOPT_FOLLOWLOCATION => true, // follow redirects
        CURLOPT_MAXREDIRS => 5, // maximum number of redirects
        CURLOPT_TIMEOUT => 30, // total timeout in seconds
        CURLOPT_SSL_VERIFYPEER => false, // INSECURE: testing only, enable in production
        CURLOPT_SSL_VERIFYHOST => false, // INSECURE: testing only, enable in production
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        CURLOPT_HEADER => true, // include response headers in the raw output
        CURLOPT_ENCODING => '', // accept every supported encoding
    ];

    // array_replace() keeps the defaults' order, overrides matching keys and
    // appends new ones, so options are applied in the same order as before.
    $finalOptions = array_replace($defaultOptions, is_array($options) ? $options : []);

    try {
        // curl_setopt_array() stops at the first option it cannot apply, so a
        // false return means the handle is only partially configured.
        if (!curl_setopt_array($ch, $finalOptions)) {
            $error = curl_error($ch);
            return [
                'success' => false,
                'error' => $error !== '' ? $error : '设置cURL选项失败'
            ];
        }

        $response = curl_exec($ch);
        if ($response === false || curl_errno($ch) !== 0) {
            $error = curl_error($ch);
            if ($error === '') {
                $error = (string) curl_strerror(curl_errno($ch));
            }
            return [
                'success' => false,
                'error' => $error
            ];
        }

        // With CURLOPT_RETURNTRANSFER overridden to false curl_exec() returns
        // true and has already printed the body; there is nothing to split.
        $raw = is_string($response) ? $response : '';
        $header = '';
        $body = $raw;

        // CURLINFO_HEADER_SIZE counts received header bytes even when the
        // headers are not part of the output, so only split when the caller
        // left CURLOPT_HEADER enabled; otherwise the body would be truncated.
        if ($raw !== '' && !empty($finalOptions[CURLOPT_HEADER])) {
            $headerSize = (int) curl_getinfo($ch, CURLINFO_HEADER_SIZE);
            $header = (string) substr($raw, 0, $headerSize);
            $body = (string) substr($raw, $headerSize);
        }

        return [
            'success' => true,
            'http_code' => curl_getinfo($ch, CURLINFO_HTTP_CODE),
            'header' => $header,
            'body' => $body,
            'url' => $url,
            // Final URL after redirects; 'url' stays the requested one.
            'effective_url' => curl_getinfo($ch, CURLINFO_EFFECTIVE_URL)
        ];
    } finally {
        // Since PHP 8.0 the handle is an object released automatically and
        // curl_close() has no effect; it is only needed on older versions.
        if (PHP_VERSION_ID < 80000) {
            curl_close($ch);
        }
    }
}
// Usage example: fetch a page with fetchWebpageWithCurl(), print a short
// summary and store the body on disk.
$url = 'https://example.com';
$result = fetchWebpageWithCurl($url);
if ($result['success']) {
    echo "抓取成功!\n";
    echo "HTTP状态码:" . $result['http_code'] . "\n";
    echo "网页内容长度:" . strlen($result['body']) . " 字节\n";
    echo "网页内容预览:\n" . substr($result['body'], 0, 500) . "...\n";
    // Save to a file; file_put_contents() returns false on failure, so only
    // report success when the write actually happened.
    if (file_put_contents('downloaded_page.html', $result['body']) !== false) {
        echo "网页已保存到 downloaded_page.html\n";
    } else {
        echo "保存文件失败:downloaded_page.html\n";
    }
} else {
    echo "抓取失败:" . $result['error'] . "\n";
}
?>
方法二:使用 file_get_contents(简单但功能有限)
<?php
/**
 * Turn raw HTTP response header lines into the associative layout that
 * get_headers($url, 1) produces, and extract the final status code.
 *
 * Status lines are stored under consecutive integer keys (0, 1, ...), one per
 * response in a redirect chain. Other headers are keyed by name as received;
 * a name that occurs more than once holds an array of its values.
 *
 * @param array $lines Raw header lines, e.g. "Content-Type: text/html".
 * @return array Two elements: [array $headers, int $statusCode]; the status
 *               code is taken from the last status line, or 0 if none exists.
 */
function parseHttpResponseHeaderLines(array $lines) {
    $headers = [];
    $statusCode = 0;
    foreach ($lines as $line) {
        // Matches "HTTP/1.1 200 OK" as well as "HTTP/2 200".
        if (preg_match('#^HTTP/\d+(?:\.\d+)?\s+(\d{3})#i', $line, $match)) {
            $headers[] = $line;
            $statusCode = (int) $match[1];
            continue;
        }
        $parts = explode(':', $line, 2);
        if (count($parts) !== 2) {
            continue;
        }
        $name = trim($parts[0]);
        $value = trim($parts[1]);
        if ($name === '') {
            continue;
        }
        if (!array_key_exists($name, $headers)) {
            $headers[$name] = $value;
        } elseif (is_array($headers[$name])) {
            $headers[$name][] = $value;
        } else {
            $headers[$name] = [$headers[$name], $value];
        }
    }
    return [$headers, $statusCode];
}

/**
 * Fetch a web page with file_get_contents() and the HTTP stream wrapper.
 *
 * A single request is made. The status code and headers are read from that
 * same response rather than from a separate get_headers() call, so they always
 * describe the returned body and are sent with the same request headers and
 * TLS settings.
 *
 * SECURITY NOTE: TLS peer verification is disabled, exactly as in the original
 * snippet (intended for testing only); enable it for production use.
 *
 * @param string $url Target URL.
 * @return array On success:
 *               ['success' => true, 'http_code' => int, 'headers' => array,
 *                'body' => string, 'url' => string], where 'headers' uses the
 *               get_headers($url, 1) layout and 'http_code' is the status of
 *               the final response (0 when no status line was received).
 *               On failure: ['success' => false, 'error' => string].
 */
function fetchWebpageWithFileGetContents($url) {
    $options = [
        'http' => [
            'method' => 'GET',
            'header' => [
                'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                // No Accept-Encoding header: the http wrapper does not
                // decompress responses, so asking for gzip/deflate would hand
                // compressed bytes back to the caller.
                // The wrapper does not reuse connections; requesting keep-alive
                // can leave the read waiting for the server to close the socket.
                'Connection: close',
            ],
            'timeout' => 30,
            // Return the body of 4xx/5xx responses instead of failing.
            'ignore_errors' => true
        ],
        'ssl' => [
            'verify_peer' => false, // INSECURE: testing only
            'verify_peer_name' => false // INSECURE: testing only
        ]
    ];
    $context = stream_context_create($options);

    // PHP 8.4+ exposes the last response headers through functions; clear any
    // headers left over from an earlier request before making this one.
    if (function_exists('http_clear_last_response_headers')) {
        http_clear_last_response_headers();
    }

    $content = file_get_contents($url, false, $context);

    if (function_exists('http_get_last_response_headers')) {
        $rawHeaders = http_get_last_response_headers();
    } else {
        // Older PHP: the wrapper fills this local variable after the request.
        $rawHeaders = isset($http_response_header) ? $http_response_header : null;
    }
    if (!is_array($rawHeaders)) {
        $rawHeaders = [];
    }

    if ($content === false) {
        return [
            'success' => false,
            'error' => '无法获取网页内容'
        ];
    }

    list($headers, $httpCode) = parseHttpResponseHeaderLines($rawHeaders);

    return [
        'success' => true,
        'http_code' => $httpCode,
        'headers' => $headers,
        'body' => $content,
        'url' => $url
    ];
}
// Usage example: fetch a page with fetchWebpageWithFileGetContents() and
// store the body on disk.
$url = 'https://example.com';
$result = fetchWebpageWithFileGetContents($url);
if ($result['success']) {
    echo "抓取成功!\n";
    echo "HTTP状态码:" . $result['http_code'] . "\n";
    // Save to a file; file_put_contents() returns false on failure, so only
    // report success when the write actually happened.
    if (file_put_contents('downloaded_page_fg.html', $result['body']) !== false) {
        echo "网页已保存到 downloaded_page_fg.html\n";
    } else {
        echo "保存文件失败:downloaded_page_fg.html\n";
    }
} else {
    echo "抓取失败:" . $result['error'] . "\n";
}
?>
方法三:增强版cURL抓取(支持更多功能)
<?php
/**
 * Reusable cURL-based page fetcher with a fluent configuration API.
 *
 * One cURL handle is created per instance and reused for every fetch() call,
 * so headers, cookies, proxy and authentication settings persist between
 * requests made through the same object.
 *
 * SECURITY NOTE: TLS verification is disabled by default, exactly as in the
 * original snippet (intended for testing only). Call setSslVerification()
 * before fetching anything in production.
 */
class WebpageFetcher {
    /** @var resource|\CurlHandle cURL handle shared by all requests of this instance */
    private $ch;
    /** @var array cookie name => value pairs sent with every request */
    private $cookies = [];
    /** @var array raw "Name: value" request header lines */
    private $headers = [];

    /**
     * Create the cURL handle and apply the default options.
     *
     * @throws RuntimeException when the cURL handle cannot be created.
     */
    public function __construct() {
        $handle = curl_init();
        if ($handle === false) {
            throw new RuntimeException('cURL初始化失败');
        }
        $this->ch = $handle;
        $this->setDefaultOptions();
    }

    /**
     * Apply the default transfer options: string return, up to 10 redirects,
     * 30 second timeout, browser-like User-Agent, headers included in the raw
     * output, every supported content encoding, verbose output off.
     */
    private function setDefaultOptions() {
        $defaultOptions = [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 10,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_SSL_VERIFYPEER => false, // INSECURE: see setSslVerification()
            CURLOPT_SSL_VERIFYHOST => false, // INSECURE: see setSslVerification()
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            CURLOPT_HEADER => true, // fetch() relies on headers being in the output
            CURLOPT_ENCODING => '',
            CURLOPT_VERBOSE => false,
        ];
        curl_setopt_array($this->ch, $defaultOptions);
    }

    /**
     * Turn TLS certificate and host name verification on or off.
     *
     * @param bool $enabled true verifies the peer certificate and host name.
     * @return $this
     */
    public function setSslVerification($enabled = true) {
        curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, (bool) $enabled);
        // 2 = the certificate must match the requested host name.
        curl_setopt($this->ch, CURLOPT_SSL_VERIFYHOST, $enabled ? 2 : 0);
        return $this;
    }

    /**
     * Add request headers; they are merged with those set by earlier calls.
     *
     * @param array $headers Raw header lines such as "Accept-Language: zh-CN".
     * @return $this
     */
    public function setHeaders($headers) {
        $this->headers = array_merge($this->headers, $headers);
        curl_setopt($this->ch, CURLOPT_HTTPHEADER, $this->headers);
        return $this;
    }

    /**
     * Add cookies; a name set again replaces the value from an earlier call.
     *
     * @param array $cookies Map of cookie name => value.
     * @return $this
     */
    public function setCookies($cookies) {
        $this->cookies = array_merge($this->cookies, $cookies);
        $pairs = [];
        foreach ($this->cookies as $key => $value) {
            $pairs[] = "$key=$value";
        }
        // Joined with "; " and no trailing separator.
        curl_setopt($this->ch, CURLOPT_COOKIE, implode('; ', $pairs));
        return $this;
    }

    /**
     * Route requests through a proxy.
     *
     * @param string $proxy     Proxy address, passed to CURLOPT_PROXY.
     * @param int    $proxyType One of the CURLPROXY_* constants.
     * @return $this
     */
    public function setProxy($proxy, $proxyType = CURLPROXY_HTTP) {
        curl_setopt($this->ch, CURLOPT_PROXY, $proxy);
        curl_setopt($this->ch, CURLOPT_PROXYTYPE, $proxyType);
        return $this;
    }

    /**
     * Set HTTP authentication credentials.
     *
     * @param string $username
     * @param string $password
     * @param int    $type One of the CURLAUTH_* constants.
     * @return $this
     */
    public function setAuth($username, $password, $type = CURLAUTH_BASIC) {
        curl_setopt($this->ch, CURLOPT_HTTPAUTH, $type);
        curl_setopt($this->ch, CURLOPT_USERPWD, "$username:$password");
        return $this;
    }

    /**
     * Fetch a URL using the configured handle.
     *
     * @param string $url Target URL.
     * @return array On success:
     *               ['success' => true, 'http_code' => int, 'headers' => array,
     *                'body' => string, 'url' => string, 'info' => array].
     *               On failure: ['success' => false, 'error' => string].
     */
    public function fetch($url) {
        curl_setopt($this->ch, CURLOPT_URL, $url);
        $response = curl_exec($this->ch);
        if ($response === false || curl_errno($this->ch) !== 0) {
            $error = curl_error($this->ch);
            if ($error === '') {
                $error = (string) curl_strerror(curl_errno($this->ch));
            }
            return [
                'success' => false,
                'error' => $error
            ];
        }

        $raw = is_string($response) ? $response : '';
        $headerSize = (int) curl_getinfo($this->ch, CURLINFO_HEADER_SIZE);
        $headerText = (string) substr($raw, 0, $headerSize);
        $body = (string) substr($raw, $headerSize);

        return [
            'success' => true,
            'http_code' => curl_getinfo($this->ch, CURLINFO_HTTP_CODE),
            'headers' => $this->parseHeaders($headerText),
            'body' => $body,
            'url' => $url,
            'info' => curl_getinfo($this->ch)
        ];
    }

    /**
     * Parse the headers of the final response into a name => value map.
     *
     * When redirects are followed the raw header text holds one block per
     * response, separated by blank lines. Only the last block is parsed, so
     * headers of intermediate responses (for example Location) do not leak
     * into the result. Names keep the case the server sent; when a name is
     * repeated the last value wins.
     *
     * @param string $headerString Raw header text as returned by cURL.
     * @return array Map of header name => value.
     */
    private function parseHeaders($headerString) {
        $blocks = preg_split('/\r?\n\r?\n/', trim($headerString));
        $lastBlock = is_array($blocks) ? (string) end($blocks) : '';

        $headers = [];
        foreach (explode("\n", $lastBlock) as $line) {
            $parts = explode(':', $line, 2);
            // Status lines such as "HTTP/1.1 200 OK" contain no colon.
            if (count($parts) === 2) {
                $headers[trim($parts[0])] = trim($parts[1]);
            }
        }
        return $headers;
    }

    /**
     * Release the cURL handle.
     */
    public function __destruct() {
        // Since PHP 8.0 the handle is an object released automatically and
        // curl_close() has no effect; it is only needed on older versions.
        if (PHP_VERSION_ID < 80000 && $this->ch) {
            curl_close($this->ch);
        }
    }

    /**
     * Convenience helper: fetch a URL with a fresh, default-configured fetcher.
     *
     * @param string $url Target URL.
     * @return array Same structure as fetch().
     */
    public static function quickFetch($url) {
        $fetcher = new self();
        return $fetcher->fetch($url);
    }
}
// Usage example: configure a WebpageFetcher, fetch a page, print a summary,
// store the body on disk and extract the page title.
$fetcher = new WebpageFetcher();
// Custom request headers and cookies.
$fetcher->setHeaders([
    'Accept-Language: zh-CN,zh;q=0.9',
    'Cache-Control: no-cache'
])->setCookies([
    'session_id' => '123456',
    'user_pref' => 'dark_mode'
]);
// Fetch the page.
$result = $fetcher->fetch('https://example.com');
if ($result['success']) {
    echo "抓取成功!\n";
    echo "HTTP状态码:" . $result['http_code'] . "\n";
    // Header names are case-insensitive and arrive in lowercase over HTTP/2,
    // so look the header up on lowercased keys.
    $responseHeaders = array_change_key_case($result['headers'], CASE_LOWER);
    echo "Content-Type: " . ($responseHeaders['content-type'] ?? '未知') . "\n";
    echo "网页大小:" . strlen($result['body']) . " 字节\n";
    // Save to a file; file_put_contents() returns false on failure, so only
    // report success when the write actually happened.
    $filename = 'webpage_' . date('Ymd_His') . '.html';
    if (file_put_contents($filename, $result['body']) !== false) {
        echo "网页已保存到: $filename\n";
    } else {
        echo "保存文件失败: $filename\n";
    }
    // Extract the title; allow attributes on the tag and line breaks inside it.
    if (preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $result['body'], $matches)) {
        echo "网页标题: " . trim($matches[1]) . "\n";
    }
} else {
    echo "抓取失败:" . $result['error'] . "\n";
}
// Quick one-off fetch through the static helper.
$quickResult = WebpageFetcher::quickFetch('https://example.com');
?>
使用说明
选择合适的方案:
- 如果需要处理复杂的网页、需要设置代理、处理Cookie等,使用方法三(增强版)
- 如果只需要简单抓取,使用方法一(cURL)
- 如果环境不支持cURL,可以使用方法二(file_get_contents)
注意事项:
- 确保PHP开启了相应的扩展(cURL、openssl等)
- 遵守网站的robots.txt规则
- 不要过于频繁地抓取同一网站
- 注意处理编码问题
- 生产环境建议开启SSL验证
常见问题处理:
- 如果遇到编码问题,可以使用 mb_convert_encoding() 进行转换
- 如果需要处理JavaScript渲染的页面,可能需要使用无头浏览器(如Puppeteer)
- 对于大文件下载,建议使用流式处理而不是一次性加载到内存
- 如果遇到编码问题,可以使用