class xiecheng_SeleniumMiddleware:
    """Scrapy downloader middleware that renders Ctrip (携程) pages with Selenium.

    For requests flagged with ``meta['use_selenium']`` the middleware loads the
    search page, scores every result title against the keyword taken from the
    request URL, clicks the best match and hands the rendered detail page back
    to the spider as an ``HtmlResponse``.
    """

    # Timeout, in seconds, passed to every explicit WebDriverWait.
    WAIT_SECONDS = 3
    # Minimum value required of similarity components 2, 3 and 4.
    MIN_SCORE = 50
    # CSS selectors for the search-result titles and the detail-page heat score.
    RESULT_TITLE_SELECTOR = '.guide-main-item-bottom .title'
    HEAT_SELECTOR = '.heatView .heatScoreView .heatScoreText'

    def __init__(self):
        self.driver = creat_browser()
        self.winflag = 0

    def closemidd(self, request):
        """Release the browser once the request marked as the last one is seen.

        The driver is quit when ``meta['closeid']`` equals ``meta['id']``.
        """
        # NOTE(review): when neither key is set both lookups return None and
        # compare equal, so the driver quits; confirm every caller sets them.
        if request.meta.get('closeid') == request.meta.get('id'):
            self.driver.quit()

    def process_request(self, request, spider):
        """Render the request with Selenium when it asks for it.

        Args:
            request: the Scrapy request; ``meta['use_selenium']`` enables the
                Selenium path and is reset to ``False`` on success.
            spider: the running spider; only its ``logger`` is used.

        Returns:
            An ``HtmlResponse`` built from the detail page, or ``None`` so that
            Scrapy's default downloader handles the request.

        Raises:
            IgnoreRequest: no result is similar enough, or Selenium failed
                (timeout, missing element, ...).
        """
        if not request.meta.get('use_selenium'):
            print('未进入携程的中间件,被转移')
            # Returning None lets Scrapy download the request normally.
            return None

        self.driver.get(request.url)
        try:
            return self._render_best_match(request)
        except IgnoreRequest:
            # Deliberate "no good match" signal: keep its own message.
            raise
        except Exception as e:
            # Log first; a statement placed after ``raise`` would never run.
            spider.logger.error(f"Error: 中间件报错,{e}")
            raise IgnoreRequest("中间件报错,或可能是显示期待的元素期待超时或是元素不存在。") from e
        finally:
            # Runs on success and on failure so the driver is always released
            # when this was the final request.
            self.closemidd(request)

    def _best_match(self, url, elements):
        """Return ``(index, score)`` of the title most similar to the URL keyword."""
        # The keyword is the URL-encoded text after the first '=' in the URL.
        keyword = urllib.parse.unquote(url.split('=')[1])
        best_index = None
        best_score = None
        for index, element in enumerate(elements):
            score = get_similarity(keyword, element.text)
            # Candidates are ranked by the last component of the score.
            if best_score is None or best_score[-1] < score[-1]:
                best_index, best_score = index, score
        return best_index, best_score

    def _render_best_match(self, request):
        """Click the best-matching search result and return its rendered page."""
        elements = WebDriverWait(self.driver, self.WAIT_SECONDS).until(
            lambda x: x.find_elements(by=By.CSS_SELECTOR,
                                      value=self.RESULT_TITLE_SELECTOR))
        best_index, best_score = self._best_match(request.url, elements)
        if best_score is None or not all(
                best_score[i] >= self.MIN_SCORE for i in (2, 3, 4)):
            raise IgnoreRequest("未找到相似度合格的元素")

        print('max', best_score)
        elements[best_index].click()
        print("click yes")

        # The click may open the detail page in a new window; if so, work in
        # that window and remember to close it afterwards.
        handles = self.driver.window_handles
        opened_new_window = len(handles) >= 2
        if opened_new_window:
            self.driver.switch_to.window(handles[-1])
        try:
            # Wait for the heat score so the detail page is fully rendered.
            WebDriverWait(self.driver, self.WAIT_SECONDS).until(
                lambda x: x.find_elements(by=By.CSS_SELECTOR,
                                          value=self.HEAT_SELECTOR))
            body = to_bytes(self.driver.page_source, encoding='utf-8')
            print('传入爬虫url')
            print(self.driver.current_url)
            # Prevent this middleware from handling the request again.
            request.meta['use_selenium'] = False
            return HtmlResponse(url=self.driver.current_url, body=body,
                                encoding='utf-8', request=request)
        finally:
            # Only close a window the click actually opened; closing the sole
            # window would leave the driver without a usable handle.
            if opened_new_window:
                self.driver.close()
                self.driver.switch_to.window(handles[0])
列名 | 数据类型 | 长度 | 主外键 | 注释 |
id | int | 11 | 是 | 编号 |
username | varchar | 45 | | 名称 |
email | varchar | 45 | | 邮箱 |
password | varchar | 45 | | 密码 |
phone | varchar | 45 | | 电话 |
address | varchar | 45 | | 地址 |
列名 | 数据类型 | 长度 | 主外键 | 注释 |
id | int | 11 | 是 | 景点ID |
Name | varchar | 1 | | 景点名称 |
Type | varchar | 11 | | 类型 |
tag | varchar | 45 | | 标签 |
area | varchar | 45 | | 地区 |
Describe | varchar | 300 | | 描述信息 |
列名 | 数据类型 | 长度 | 主外键 | 注释 |
id | int | 11 | 是 | 城市ID |
name | varchar | 50 | | 城市名称 |
Tag | varchar | 50 | | 标签 |
Describe | varchar | 200 | | 描述 |
hot | char | 20 | | 城市热度 |
times | int | 11 | | 点击次数 |
列名 | 数据类型 | 长度 | 主外键 | 注释 |
URL | varchar | 50 | 是 | URL |
Start-lew | varchar | 50 | | 星级 |
Score | int | 11 | | 评分 |
Content | varchar | 200 | | 内容 |
Viewp-id | varchar | 50 | | 景点ID |
User-id | varchar | 50 | | 用户ID |
class Case_item(models.Model):
    """One scenic spot record: city, title, links, rating, heat and intro."""

    shiqu = models.CharField(max_length=124, verbose_name='城市')
    title = models.CharField(max_length=124, verbose_name='景点')
    details = models.CharField(max_length=424, verbose_name='详情页链接')
    pingfen = models.FloatField(verbose_name='评分')
    city = models.CharField(max_length=424, verbose_name='详细地点')
    heatdegree = models.FloatField(verbose_name='热度')
    img = models.CharField(max_length=424, verbose_name='图片链接')
    content = models.TextField(verbose_name='简介')
    openinghours = models.CharField(max_length=424, verbose_name='开放时间')
    count = models.FloatField(verbose_name='评论数量')

    class Meta:
        verbose_name = "数据表"
        verbose_name_plural = verbose_name

    def __str__(self):
        """Display the record by its title."""
        return self.title


class PingLun(models.Model):
    """A comment text attached to a ``Case_item``."""

    # Deleting the scenic spot deletes its comments as well.
    case_item_id = models.ForeignKey(Case_item, on_delete=models.CASCADE)
    text = models.TextField(verbose_name='评论内容')

    class Meta:
        verbose_name = "评论表"
        verbose_name_plural = verbose_name


class DianZhan(models.Model):
    """A like: links one ``Users`` row to one ``Case_item`` row."""

    case_item_id = models.ForeignKey(Case_item, on_delete=models.CASCADE)
    user_id = models.ForeignKey(Users, on_delete=models.CASCADE)

    class Meta:
        verbose_name = "点赞表"
        verbose_name_plural = verbose_name


class JingDian(models.Model):
    """A review of a scenic area; every value is stored as text."""

    name = models.CharField(max_length=124, verbose_name='景区名称')
    auth = models.CharField(max_length=124, verbose_name='评论者')
    pingfen = models.CharField(max_length=124, verbose_name='评分')
    text = models.TextField(verbose_name='评价文字')
    shijian = models.CharField(max_length=124, verbose_name='时间')
    laiyuan = models.CharField(max_length=124, verbose_name='数据来源')

    class Meta:
        verbose_name = "景点表"
        verbose_name_plural = verbose_name


class SC_food(models.Model):
    """Scenic-area listing whose field names are Chinese identifiers."""

    景区id = models.CharField(max_length=524, verbose_name='景区id')
    景区名称 = models.CharField(max_length=524, verbose_name='景区名称')
    地区 = models.CharField(max_length=524, verbose_name='地区')
    评分 = models.CharField(max_length=524, verbose_name='评分')
    范例 = models.CharField(max_length=524, verbose_name='范例')
    景区url = models.CharField(max_length=524, verbose_name='景区url')
    景区imgUrl = models.CharField(max_length=524, verbose_name='景区imgUrl')
    评论条数 = models.CharField(max_length=524, verbose_name='评论条数')
    数据来源 = models.CharField(max_length=524, verbose_name='数据来源')
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


def recommend_places(user_preference, df, cosine_sim, top_n=3):
    """Return the places most similar to the place named ``user_preference``.

    Args:
        user_preference: a value from the '景点名称' column.
        df: DataFrame with a '景点名称' column, row-aligned with ``cosine_sim``.
        cosine_sim: square matrix of pairwise similarities between rows.
        top_n: how many places to return (default 3).

    Returns:
        A Series of place names, most similar first. It is empty when
        ``user_preference`` is not a known place name.
    """
    names = df['景点名称']
    # Work with row positions: cosine_sim is positional, df.index may not be.
    positions = [pos for pos, name in enumerate(names) if name == user_preference]
    if not positions:
        return names.iloc[[]]
    anchor = positions[0]
    # Exclude the anchor explicitly instead of assuming it sorts first.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[anchor]) if pos != anchor),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return names.iloc[[pos for pos, _ in ranked[:top_n]]]


def recommend_by_keywords(user_preference, df, tfidf, tfidf_matrix, top_n=3):
    """Return the places whose descriptions best match free-text keywords.

    Args:
        user_preference: free-text preference keywords.
        df: DataFrame with a '景点名称' column, row-aligned with ``tfidf_matrix``.
        tfidf: the fitted vectorizer that produced ``tfidf_matrix``.
        tfidf_matrix: TF-IDF matrix of the descriptions.
        top_n: how many places to return (default 3).

    Returns:
        A Series of place names with a positive similarity, best first.
    """
    scores = linear_kernel(tfidf.transform([user_preference]), tfidf_matrix)[0]
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(scores) if score > 0),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return df['景点名称'].iloc[[pos for pos, _ in ranked[:top_n]]]


# Sample data set holding the name and description of each place.
data = {
    '景点名称': ['景点A', '景点B', '景点C'],
    '描述': ['景点A的描述', '景点B的描述', '景点C的描述'],
}
df = pd.DataFrame(data)

# The user's preference keywords, e.g. from user behaviour or a survey.
user_preference = "山水风光"

# Vectorize the descriptions with TF-IDF. scikit-learn has no built-in
# 'chinese' stop-word list, and its default tokenizer treats an unsegmented
# Chinese string as a single token, so character n-grams are used instead.
# NOTE(review): a word segmenter would give better features, but this file
# does not import one.
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1, 2))
tfidf_matrix = tfidf.fit_transform(df['描述'])

# Pairwise similarity between places.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Use place-to-place similarity when the preference names a place, otherwise
# match the preference text against the descriptions.
recommendations = recommend_places(user_preference, df, cosine_sim)
if recommendations.empty:
    recommendations = recommend_by_keywords(user_preference, df, tfidf, tfidf_matrix)

print("推荐的景点:")
for place in recommendations:
    print(place)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Exploratory analysis of the scenic-spot data set: clean the rating column,
# then plot the rating distribution and the number of spots per region.
# NOTE(review): the Chinese titles and labels need a CJK-capable matplotlib
# font to render; none is configured here.

# Load the scenic-spot data set.
df = pd.read_csv('tourist_spots_data.csv')

# Show basic information. DataFrame.info() prints by itself and returns None,
# so it is not wrapped in print().
df.info()

# Data cleaning: coerce ratings to numbers so text values become NaN, then
# drop missing and negative ratings.
df['评分'] = pd.to_numeric(df['评分'], errors='coerce')
df = df.dropna(subset=['评分'])
df = df[df['评分'] >= 0]

# Rating distribution.
plt.figure(figsize=(10, 6))
sns.histplot(df['评分'], bins=10, kde=True)
plt.title('评分分布')
plt.xlabel('评分')
plt.ylabel('频数')
plt.show()

# Number of spots per region.
spots_by_region = df['地区'].value_counts()
print(spots_by_region)

# The five spots with the highest average rating.
top_rated_spots = (
    df.groupby('景点名称')['评分']
    .mean()
    .sort_values(ascending=False)
    .head(5)
)
print(top_rated_spots)

# Bar chart of the number of spots per region.
plt.figure(figsize=(12, 6))
sns.barplot(x=spots_by_region.index, y=spots_by_region.values)
plt.title('不同地区景点数量')
plt.xlabel('地区')
plt.ylabel('景点数量')
plt.xticks(rotation=45)
plt.show()
欢迎光临 ToB企服应用市场:ToB评测及商务社交产业平台 (https://dis.qidao123.com/) | Powered by Discuz! X3.4 |