Complete the task in Python. Final deliverables: ① the original text file of the professional article; ② the punctuation file used for filtering; ③ the stop-word file; ④ the mask image file; ⑤ the .py source file; ⑥ the word-cloud image.
Package all of these into a single compressed archive.
Here is a demo I wrote earlier that you can use as a reference; please mark the answer as accepted if it helps, thanks.
Sample output:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
import os
browser = webdriver.Chrome()
url = 'https://movie.douban.com/subject/34841067/comments?status=P'
browser.get(url)
selector = 'span.short'
results = []
while True:
    elements = browser.find_elements(By.CSS_SELECTOR, selector)
    for element in elements:
        comment = element.text.strip()
        if len(comment) > 0:
            results.append(comment)
    # Click the "next page" button; stop when it no longer exists
    try:
        next_btn = browser.find_element(By.LINK_TEXT, '后页>')
        next_btn.click()
        time.sleep(2)  # give the next page time to load before re-querying
    except NoSuchElementException:
        break
# Close the browser
browser.quit()
text = '\n'.join(results)
# Chinese word segmentation with jieba
words = jieba.cut(text)
stop_words_path = 'stopwords.txt'
with open(stop_words_path, encoding='utf-8') as f:
    stop_words = f.read().splitlines()
# Drop whitespace-only tokens and stop words
valid_words = [word for word in words if word.strip() and word not in stop_words]
valid_text = ' '.join(valid_words)
wc = WordCloud(
    font_path="PingFang Bold.ttf",
    width=800,
    height=600,
    background_color='white',
    max_words=200,
    max_font_size=80,
    random_state=42
)
wc.generate(valid_text)
# Define the output path and make sure the directory exists
output_dir = 'output'
output_path = os.path.join(output_dir, 'wordcloud.png')
os.makedirs(output_dir, exist_ok=True)
wc.to_file(output_path)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
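The assignment also asks for a mask image (deliverable ④), which this demo does not use yet. Here is a minimal sketch of feeding a mask into the same WordCloud setup, assuming a local file mask.png (a hypothetical path you would replace):
from PIL import Image

# Load the mask image as a NumPy array; the word cloud is drawn only inside the mask shape
mask = np.array(Image.open('mask.png'))  # hypothetical path, replace with your mask file
wc_masked = WordCloud(
    font_path="PingFang Bold.ttf",
    background_color='white',
    max_words=200,
    mask=mask,
    random_state=42
)
wc_masked.generate(valid_text)
wc_masked.to_file(os.path.join(output_dir, 'wordcloud_masked.png'))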
A second, separate demo script: a statsmodels linear regression on movie box-office data.
# -*- coding: utf-8 -*-
import numpy as np
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_excel('dy01.xlsx')  # pandas read_excel does not take an encoding argument
y=data['box-office']
x1=data['release-time']
x2=data['score']
x3=data['score-num']
x4=data['length']
x5=data['name']
x6=data['h1']
x7=data['h2']
x8=data['h3']
x=np.column_stack((x2,x3,x4,x6,x7,x8))
# Fit the linear regression
x_n = sm.add_constant(x)  # statsmodels requires explicitly adding the constant (intercept) term
model = sm.OLS(y, x_n)    # ordinary least squares model
results = model.fit()     # fitted results
# Print the regression results
print(results.summary())
print('Parameters: ', results.params)
print('R2: ', results.rsquared)
# Plot predicted vs. actual box office
plt.figure()
plt.rcParams['font.sans-serif'] = ['Kaiti']  # default font so the Chinese title renders
plt.title(u"线性回归预测")
plt.xlabel(u"predicted box office")
plt.ylabel(u"actual box office")
plt.scatter(results.fittedvalues, y, marker="o", color="b", s=50)
# Reference line: points on y = x correspond to perfect predictions
lims = [min(y.min(), results.fittedvalues.min()), max(y.max(), results.fittedvalues.max())]
plt.plot(lims, lims, linewidth=3, color="r")
plt.show()
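If a forecast for a new movie is needed, the fitted results object can be reused via results.predict. A brief sketch, where the feature values are placeholders and the column order must match x (score, score-num, length, h1, h2, h3):
# Hypothetical feature row with placeholder values
new_x = np.array([[8.5, 1200000, 120, 1, 0, 0]])
# add_constant skips a single row by default, so force the intercept column
new_x_n = sm.add_constant(new_x, has_constant='add')
print('Predicted box office:', results.predict(new_x_n))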
I need to use Python to find a pharmacy-related article on a website and run a word-cloud analysis on it. Punctuation and meaningless words must be removed, a mask image provided, and both the word-cloud result and the word-frequency statistics written to files. Finally, everything should be packaged into a zip archive.
Solution:
import requests
from bs4 import BeautifulSoup
# Request the page
url = "http://www.example.com"  # replace with the actual article URL
response = requests.get(url)
response.encoding = response.apparent_encoding  # avoid garbled Chinese text if the charset is misdetected
# Parse the article body
soup = BeautifulSoup(response.text, 'html.parser')
article = soup.find('div', {'class': 'article-content'}).get_text()  # adjust the selector to the target site
import jieba
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Chinese word segmentation and cleaning
words = jieba.cut(article)
stopwords = set(['的', '和', '是', '在', '了', '将', '等', '着', '我', '你', '他'])  # extend with real stop words as needed
words_cleaned = []
for word in words:
    if word.strip() and word not in stopwords:
        words_cleaned.append(word)
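The assignment also lists a dedicated punctuation file (deliverable ②); a minimal sketch of filtering against such a file, assuming a hypothetical punctuation.txt with one symbol per line:
# Hypothetical punctuation file: one punctuation mark per line
with open('punctuation.txt', encoding='utf-8') as f:
    punctuation = set(f.read().split())
words_cleaned = [w for w in words_cleaned if w not in punctuation]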
# Load and preview the mask image
mask = np.array(Image.open('mask.png'))  # replace with the actual mask image path
plt.imshow(mask)
plt.axis('off')
plt.show()
from collections import Counter
# Generate the word cloud
text = ' '.join(words_cleaned)
wordcloud = WordCloud(font_path="PingFang Bold.ttf", mask=mask, background_color="white", max_words=2000).generate(text)  # font_path is required for Chinese text; same font file as in the demo above
# Save the word-cloud image
wordcloud.to_file("wordcloud.png")
# Count word frequencies
counter = Counter(words_cleaned)
most_common = counter.most_common(30)  # take the 30 most frequent words
with open("word_frequency.txt", "w", encoding="utf-8") as f:
    f.write("Word,Frequency\n")
    for word, freq in most_common:
        f.write(word + "," + str(freq) + "\n")
import zipfile
# Package the result files
with zipfile.ZipFile("results.zip", "w") as zf:  # avoid shadowing the built-in zip
    zf.write("wordcloud.png")
    zf.write("word_frequency.txt")
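The assignment asks the archive to contain more than these two files (article text, punctuation file, stop-word file, mask image, and the .py source). A sketch of saving the article text and packaging every deliverable, where the file names are assumptions based on this script:
import os

# Save the article text so it can be included in the archive (deliverable ①)
with open("article.txt", "w", encoding="utf-8") as f:
    f.write(article)

# File names below are assumptions; rename to match your own files
deliverables = [
    "article.txt",            # ① original article text
    "punctuation.txt",        # ② punctuation to remove
    "stopwords.txt",          # ③ stop-word file (if used instead of the inline set)
    "mask.png",               # ④ mask image
    "wordcloud_analysis.py",  # ⑤ this script, saved under a name of your choice
    "wordcloud.png",          # ⑥ word-cloud image
    "word_frequency.txt",     # word-frequency statistics
]
with zipfile.ZipFile("results.zip", "w") as zf:
    for name in deliverables:
        if os.path.exists(name):  # skip anything not present
            zf.write(name)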