Python web scraping and writing the results to a file
Core code:
Snippet 1:
    # Extract the href from every <a> tag
    for link in soup.find_all("a"):
        href = link.get("href")
        # Skip invalid links
        if not href or href.startswith(("#", "javascript:", "mailto:")):
            continue
        # Build the full URL
        full_url = f"{url.rstrip('/')}/{href.lstrip('/')}"
        collected_links.append(full_url)
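Note that the string concatenation above only handles relative paths on the same host; an href that is already absolute (https://...) or protocol-relative (//...) would be mangled. A minimal sketch of a more robust join uses urllib.parse.urljoin (the page_url and hrefs values below are hypothetical, for illustration only):

from urllib.parse import urljoin

# Hypothetical example values, for illustration only
page_url = "https://example.com/docs/index.html"
hrefs = ["/about", "guide.html", "https://other.site/page", "//cdn.example.com/a.js"]

for href in hrefs:
    # urljoin resolves relative, absolute and protocol-relative links correctly
    print(urljoin(page_url, href))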
Snippet 2:
def save_links_to_file(links, file_path, buffer_size=1000):
    seen = set()
    buffer = []
    # Load links already in the file to avoid writing duplicates
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            seen.update(line.strip() for line in f)
    except FileNotFoundError:
        pass
    # Process each link
    for link in links:
        if link in seen:
            continue
        seen.add(link)
        buffer.append(f"{link}\n")
        # Flush the buffer to the file once it is full
        if len(buffer) >= buffer_size:
            with open(file_path, "a", encoding="utf-8") as f:
                f.writelines(buffer)
            buffer = []
    # Write any remaining links
    if buffer:
        with open(file_path, "a", encoding="utf-8") as f:
            f.writelines(buffer)
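A quick way to exercise this function (the links list below is hypothetical; the output path reuses the ./links.txt example from the prompt later in the program):

# Hypothetical usage, for illustration only
links = [f"https://example.com/page/{i}" for i in range(5)]
save_links_to_file(links, "./links.txt")
# Calling it again with the same links appends nothing new,
# because existing lines are loaded into `seen` first.
save_links_to_file(links, "./links.txt")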
Complete code:
import time
from urllib.request import urlopen, Request
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from urllib.error import URLError, HTTPError
# Record the program start time
start_time = time.perf_counter()

def fetch_url(url, headers, collected_links):
    try:
        # Build the request object
        req = Request(url, headers=headers)
        # Send the request and read the response body
        with urlopen(req) as response:
            content = response.read()
            status_code = response.getcode()
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(content, "lxml")
        # Extract the href from every <a> tag
        for link in soup.find_all("a"):
            href = link.get("href")
            # Skip invalid links
            if not href or href.startswith(("#", "javascript:", "mailto:")):
                continue
            # Build the full URL
            full_url = f"{url.rstrip('/')}/{href.lstrip('/')}"
            collected_links.append(full_url)
        # Print the result
        print(f"URL: {url}")
        print(f"Status: {status_code}")
        print(f"Description: {'success' if status_code == 200 else 'unknown'}")
    except HTTPError as e:
        print(f"HTTP error: URL={url}, status code={e.code}, reason={e.reason}")
    except URLError as e:
        print(f"URL error: URL={url}, reason={e.reason}")
    except Exception as e:
        print(f"Request failed: {e}")
    return collected_links
def extract_base_url(url):
    # Keep only the scheme and host, e.g. https://example.com
    parsed = urlparse(url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"
    return base_url
def save_links_to_file(links, file_path, buffer_size=1000):
    seen = set()
    buffer = []
    # Load links already in the file to avoid writing duplicates
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            seen.update(line.strip() for line in f)
    except FileNotFoundError:
        pass
    # Process each link
    for link in links:
        if link in seen:
            continue
        seen.add(link)
        buffer.append(f"{link}\n")
        # Flush the buffer to the file once it is full
        if len(buffer) >= buffer_size:
            with open(file_path, "a", encoding="utf-8") as f:
                f.writelines(buffer)
            buffer = []
    # Write any remaining links
    if buffer:
        with open(file_path, "a", encoding="utf-8") as f:
            f.writelines(buffer)
if __name__ == "__main__":
    # User input
    input_url = input("Enter a URL: ")
    output_path = input("Enter the output path (e.g. ./links.txt): ")
    # Extract the base URL
    base_url = extract_base_url(input_url)
    # Set the request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Referer": base_url,
        "Host": base_url[8:] if base_url.startswith("https://") else base_url[7:],
    }
    # Collected links (seeded with the base URL itself)
    all_links = [base_url]
    # Fetch the page and collect its links
    fetch_url(base_url, headers, all_links)
    # Save the links to the file
    save_links_to_file(all_links, output_path)
    # Print the total running time
    end_time = time.perf_counter()
    print(f"Total running time: {end_time - start_time:.4f} s")
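The Host header above is built by slicing the scheme off base_url. An equivalent, slightly clearer way is to take the netloc from urlparse, which is already imported in this script (the base_url value below is a hypothetical example):

from urllib.parse import urlparse

# Hypothetical example value, for illustration only
base_url = "https://example.com"
# netloc is the host (plus port, if any), regardless of the scheme's length
host = urlparse(base_url).netloc
print(host)  # example.com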