Python web scraping and writing the results to a file
Core code:
Snippet 1:
    # Extract the href from every <a> tag
    for link in soup.find_all("a"):
        href = link.get("href")
        # Skip invalid links
        if not href or href.startswith(("#", "javascript:", "mailto:")):
            continue
        # Build the full URL
        full_url = f"{url.rstrip('/')}/{href.lstrip('/')}"
        collected_links.append(full_url)
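Note that the string concatenation above only handles relative paths on the same host; an href that is already absolute (https://...) or protocol-relative (//...) would be mangled. A minimal sketch of a more robust join uses urllib.parse.urljoin (the page_url and hrefs values below are hypothetical, for illustration only):

from urllib.parse import urljoin

# Hypothetical example values, for illustration only
page_url = "https://example.com/docs/index.html"
hrefs = ["/about", "guide.html", "https://other.site/page", "//cdn.example.com/a.js"]

for href in hrefs:
    # urljoin resolves relative, absolute and protocol-relative links correctly
    print(urljoin(page_url, href))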
Snippet 2:
def save_links_to_file(links, file_path, buffer_size=1000):
    seen = set()
    buffer = []
    # Load links already in the file to avoid writing duplicates
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            seen.update(line.strip() for line in f)
    except FileNotFoundError:
        pass
    # Process each link
    for link in links:
        if link in seen:
            continue
        seen.add(link)
        buffer.append(f"{link}\n")
        # Flush the buffer to the file once it is full
        if len(buffer) >= buffer_size:
            with open(file_path, "a", encoding="utf-8") as f:
                f.writelines(buffer)
            buffer = []
    # Write any remaining links
    if buffer:
        with open(file_path, "a", encoding="utf-8") as f:
            f.writelines(buffer)
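A quick way to exercise this function (the links list below is hypothetical; the output path reuses the ./links.txt example from the prompt later in the program):

# Hypothetical usage, for illustration only
links = [f"https://example.com/page/{i}" for i in range(5)]
save_links_to_file(links, "./links.txt")
# Calling it again with the same links appends nothing new,
# because existing lines are loaded into `seen` first.
save_links_to_file(links, "./links.txt")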
Complete code:
import time
from urllib.request import urlopen, Request
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from urllib.error import URLError, HTTPError
# Record the program start time
start_time = time.perf_counter()

def fetch_url(url, headers, collected_links):
    try:
        # Build the request object
        req = Request(url, headers=headers)
        # Send the request and read the response body
        with urlopen(req) as response:
            content = response.read()
            status_code = response.getcode()
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(content, "lxml")
        # Extract the href from every <a> tag
        for link in soup.find_all("a"):
            href = link.get("href")
            # Skip invalid links
            if not href or href.startswith(("#", "javascript:", "mailto:")):
                continue
            # Build the full URL
            full_url = f"{url.rstrip('/')}/{href.lstrip('/')}"
            collected_links.append(full_url)
        # Print the result
        print(f"URL: {url}")
        print(f"Status: {status_code}")
        print(f"Description: {'success' if status_code == 200 else 'unknown'}")
    except HTTPError as e:
        print(f"HTTP error: URL={url}, status code={e.code}, reason={e.reason}")
    except URLError as e:
        print(f"URL error: URL={url}, reason={e.reason}")
    except Exception as e:
        print(f"Request failed: {e}")
    return collected_links
def extract_base_url(url):
    # Keep only the scheme and host, e.g. https://example.com
    parsed = urlparse(url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"
    return base_url
def save_links_to_file(links, file_path, buffer_size=1000):
    seen = set()
    buffer = []
    # Load links already in the file to avoid writing duplicates
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            seen.update(line.strip() for line in f)
    except FileNotFoundError:
        pass
    # Process each link
    for link in links:
        if link in seen:
            continue
        seen.add(link)
        buffer.append(f"{link}\n")
        # Flush the buffer to the file once it is full
        if len(buffer) >= buffer_size:
            with open(file_path, "a", encoding="utf-8") as f:
                f.writelines(buffer)
            buffer = []
    # Write any remaining links
    if buffer:
        with open(file_path, "a", encoding="utf-8") as f:
            f.writelines(buffer)
if __name__ == "__main__":
    # User input
    input_url = input("Enter a URL: ")
    output_path = input("Enter the output path (e.g. ./links.txt): ")
    # Extract the base URL
    base_url = extract_base_url(input_url)
    # Set the request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Referer": base_url,
        "Host": base_url[8:] if base_url.startswith("https://") else base_url[7:],
    }
    # Collected links (seeded with the base URL itself)
    all_links = [base_url]
    # Fetch the page and collect its links
    fetch_url(base_url, headers, all_links)
    # Save the links to the file
    save_links_to_file(all_links, output_path)
    # Print the total running time
    end_time = time.perf_counter()
    print(f"Total running time: {end_time - start_time:.4f} s")
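The Host header above is built by slicing the scheme off base_url. An equivalent, slightly clearer way is to take the netloc from urlparse, which is already imported in this script (the base_url value below is a hypothetical example):

from urllib.parse import urlparse

# Hypothetical example value, for illustration only
base_url = "https://example.com"
# netloc is the host (plus port, if any), regardless of the scheme's length
host = urlparse(base_url).netloc
print(host)  # example.com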