主题

python
import streamlit as st
import requests
import time,os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Page heading shown at the top of the Streamlit app.
st.subheader('采集网页')

# Module-level placeholder; start_driver() stores the real WebDriver handle
# in st.session_state.driver, not in this name.
driver = None

# Folder named "采集结果", created relative to the current working directory.
# NOTE(review): presumably where scraped output is meant to be saved; nothing
# in the visible code writes to it yet -- confirm.
save_path = '采集结果'
# exist_ok=True makes creation idempotent and removes the check-then-create
# race that `if not os.path.exists(...)` has when two runs start together.
os.makedirs(save_path, exist_ok=True)

# NOTE(review): looks like a Chrome user-data (profile) directory; it is only
# created here and not passed to Chrome in the visible code -- confirm usage.
chrome_user_data_dir = '/Users/wangxiaomin/my_code/patrick-notes/0Scripts/streamlit_demo/chrome_user_data'
os.makedirs(chrome_user_data_dir, exist_ok=True)
def get_aricle_content(driver):
    """Collect the distinct texts of all <p> and <span> elements on the page.

    Every text is echoed to the Streamlit page with ``st.write`` the first
    time it is encountered. All ``<p>`` elements are processed before any
    ``<span>`` element, so the result keeps that first-seen order.

    Args:
        driver: Object exposing ``find_elements(by, value)`` whose results
            have a ``text`` attribute (the Selenium WebDriver in this script).

    Returns:
        list: The unique element texts, in first-seen order.
    """
    texts = []
    # A set gives O(1) duplicate checks; `texts` alone preserves the order.
    seen = set()
    for tag_name in ('p', 'span'):
        for element in driver.find_elements(By.TAG_NAME, tag_name):
            line = element.text
            if line in seen:
                continue
            seen.add(line)
            texts.append(line)
            st.write(line)
    return texts
# Make sure session state has a "driver" slot (initially empty) before the
# code below reads st.session_state.driver.
if "driver" not in st.session_state:
    st.session_state["driver"] = None
def start_driver(
    driver_path="/Users/wangxiaomin/Documents/selenium_driver/chromedriver_135",
    debugger_address="127.0.0.1:9222",
):
    """Create a Chrome WebDriver and store it in ``st.session_state.driver``.

    The driver is configured with the ``debuggerAddress`` experimental
    option, i.e. it connects to a Chrome debugger server at that address
    rather than being given launch options.

    Args:
        driver_path: Path to the chromedriver executable. The default is a
            machine-specific path; replace it with your actual path.
        debugger_address: ``host:port`` value passed as Chrome's
            ``debuggerAddress`` option.

    Returns:
        None. The created driver is written to ``st.session_state.driver``.
    """
    chrome_options = Options()
    chrome_options.add_experimental_option("debuggerAddress", debugger_address)
    service = Service(executable_path=driver_path)
    st.session_state.driver = webdriver.Chrome(service=service, options=chrome_options)
# "Start" button: create the WebDriver and keep it in session state.
if st.button('启动'):
    start_driver()

# Show the scraping controls only while a driver is stored and it reports a
# non-empty page title.
# NOTE(review): a page with an empty title also hides this section -- confirm
# that this is the intended liveness check.
if "driver" in st.session_state and st.session_state.driver and st.session_state.driver.title:
    st.success("启动成功")
    link = st.text_input('请输入链接')
    if st.button('采集文章'):
        if not link.strip():
            # An empty URL would make driver.get() fail; ask for input instead.
            st.warning('请输入链接')
        else:
            st.session_state.driver.get(link)
            st.write(st.session_state.driver.title)
            with st.spinner('采集中...'):
                get_aricle_content(st.session_state.driver)
            st.write('采集完成')

# "Close" button: shut down the driver held in session state. The module-level
# `driver` name is always None, so it must not be used here.
if st.button('关闭'):
    active_driver = st.session_state.get("driver")
    if active_driver is not None:
        # quit() ends the WebDriver session; close() would only close the
        # current window and leave the session behind.
        active_driver.quit()
        # Clear the slot so later runs do not touch a dead driver.
        st.session_state.driver = None