【大数据】【Spark】书籍推荐统计分析

  金牌会员 | 2025-1-10 12:47:38 | 显示全部楼层 | 阅读模式
打印 上一主题 下一主题

主题 868|帖子 868|积分 2604

  

数据集说明



  • 在这个借助Goodbook网站收集的数据集中,可以获得有关书籍的信息,如作者、页数、评分和其他信息
文件说明

books.csv

  1. bookID,title,authors,average_rating,isbn,isbn13,language_code,  num_pages,ratings_count,text_reviews_count,publication_date,publisher
  2. 1,Harry Potter and the Half-Blood Prince (Harry Potter  #6),J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
  3. 2,Harry Potter and the Order of the Phoenix (Harry Potter  #5),J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
  4. 4,Harry Potter and the Chamber of Secrets (Harry Potter  #2),J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
  5. ...
复制代码

业务需求

(1)统计最受关注的书籍Top 10

(2)统计书籍篇幅Top 10

(3)统计不同出版社出版的书籍数量

(4)统计不同语言的书籍数量

(5)统计最不受关注的高分书籍Top 10(评分4.5以上,评分人数1w以上,评论数200以下)

(6)统计不同年份出版的书籍数量

(7)统计不同作者的书籍的平均评分

(8)统计在最受关注的书籍Top 1000中,不同出版社出版的书籍数量

(9)统计在最受关注的书籍Top 1000中,不同语言的书籍数量

(10)统计不同作者的书籍的平均受关注程度


需求实现

数据预处理

  1. # -*- coding: utf-8 -*-
  2. # @Time     : 2024/12/14 0:49
  3. # @Author   : 从心
  4. # @File     : spark_book_recommendation_analysis_preprocess.py
  5. # @Software : PyCharm
  6. import pandas as pd
  7. import numpy as np
  8. df = pd.read_csv('../data/books.csv', on_bad_lines='skip')
  9. df.columns = df.columns.str.strip()
  10. print(df.head(3))
  11. df.info()
  12. df = df.dropna()
  13. df.info()
  14. df = df.drop_duplicates(keep='first')
  15. df.info()
  16. def convert_date(date_str):
  17.     try:
  18.         converted_date = pd.to_datetime(date_str, format='%m/%d/%Y')
  19.         return converted_date.strftime('%Y-%m-%d')
  20.     except ValueError as e:
  21.         print(f"{date_str} 转换失败: {e}")
  22.         return np.nan
  23. df['publication_date'] = df['publication_date'].apply(convert_date)
  24. df = df.dropna()
  25. df.info()
  26. print(df['language_code'].unique())
  27. df.to_csv('../data/books_cleaned.csv', encoding='utf-8', index=False)
复制代码
数据统计分析

  1. # -*- coding: utf-8 -*-
  2. # @Time     : 2024/12/14 0:50
  3. # @Author   : 从心
  4. # @File     : spark_book_recommendation_analysis.py
  5. # @Software : PyCharm
  6. from pyspark import SparkConf
  7. from pyspark.sql import SparkSession
  8. from pyspark.sql.functions import date_format, split, rank
  9. from pyspark.sql.window import Window
  10. spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()
  11. # 视图 books
  12. df_books = spark.read.csv('/input_spark_book_recommendation_analysis/books_cleaned.csv', header=True, inferSchema=True)
  13. df_books.show(10)
  14. df_books.createOrReplaceTempView('books')
  15. """
  16. (1) 统计最受关注的书籍 Top 10
  17. """
  18. df_books_attention_top_10 = spark.sql(
  19.     """
  20.     select bookID, title, text_reviews_count, substring_index(authors, '/', 1) as author_first, average_rating, isbn, isbn13, language_code, num_pages, ratings_count, publication_date, publisher
  21.     from books
  22.     order by text_reviews_count desc
  23.     """
  24. )
  25. df_books_attention_top_10 = df_books_attention_top_10.repartition(1)
  26. df_books_attention_top_10.show(n=10, truncate=False)
  27. df_books_attention_top_10.write.csv('/result/books_attention_top_10.csv',
  28.                                     mode='overwrite')
  29. """
  30. (2) 统计书籍篇幅 Top 10
  31. """
  32. df_books_length_top_10 = spark.sql(
  33.     """
  34.     select bookID, title, num_pages, substring_index(authors, '/', 1) as author_first, average_rating, isbn, isbn13, language_code, ratings_count, text_reviews_count, publication_date, publisher
  35.     from books
  36.     order by num_pages desc
  37.     """
  38. )
  39. df_books_length_top_10 = df_books_length_top_10.repartition(1)
  40. df_books_length_top_10.show(n=10, truncate=False)
  41. df_books_length_top_10.write.csv('/result/books_length_top_10.csv', mode='overwrite')
  42. """
  43. (3) 统计不同出版社出版的书籍数量
  44. """
  45. df_publisher_books_num = spark.sql(
  46.     """
  47.     select publisher, count(*) as books_num
  48.     from books
  49.     group by publisher
  50.     order by books_num desc
  51.     """
  52. )
  53. df_publisher_books_num = df_publisher_books_num.repartition(1)
  54. df_publisher_books_num.show(n=10, truncate=False)
  55. df_publisher_books_num.write.csv('/result/publisher_books_num.csv',
  56.                                  mode='overwrite')
  57. """
  58. (4) 统计不同语言的书籍数量
  59. """
  60. df_language_books_num = spark.sql(
  61.     """
  62.     select language_code, count(*) as books_num
  63.     from books
  64.     group by language_code
  65.     order by books_num desc
  66.     """
  67. )
  68. df_language_books_num = df_language_books_num.repartition(1)
  69. df_language_books_num.show(n=10, truncate=False)
  70. df_language_books_num.write.csv('/result/language_books_num.csv', mode='overwrite')
  71. """
  72. (5) 统计最不受关注的高分书籍 Top 10 (评分 4.5 以上, 评分人数 1w 以上, 评论数 200 以下)
  73. """
  74. df_books_rating_no_attention_top_10 = spark.sql(
  75.     """
  76.     select bookID, title, substring_index(authors, '/', 1) as author_first, average_rating, isbn, isbn13, language_code, num_pages, ratings_count, text_reviews_count, publication_date, publisher
  77.     from books
  78.     where average_rating > 4.5 and ratings_count > 10000 and text_reviews_count < 200
  79.     order by text_reviews_count asc
  80.     """
  81. )
  82. df_books_rating_no_attention_top_10 = df_books_rating_no_attention_top_10.repartition(1)
  83. df_books_rating_no_attention_top_10.show(n=10, truncate=False)
  84. df_books_rating_no_attention_top_10.write.csv(
  85.     '/result/books_rating_no_attention_top_10.csv', mode='overwrite')
  86. # 视图 books_with_year
  87. df_books_with_year = df_books.withColumn('year', date_format(df_books['publication_date'], 'yyyy'))
  88. df_books_with_year.show(10)
  89. df_books_with_year.createOrReplaceTempView('books_with_year')
  90. """
  91. (6) 统计不同年份出版的书籍数量
  92. """
  93. df_year_books_num = spark.sql(
  94.     """
  95.     select year, count(*) as books_num
  96.     from books_with_year
  97.     group by year
  98.     order by year asc
  99.     """
  100. )
  101. df_year_books_num = df_year_books_num.repartition(1)
  102. df_year_books_num.show(n=10, truncate=False)
  103. df_year_books_num.write.csv('/result/year_books_num.csv', mode='overwrite')
  104. # 视图 books_with_author_first
  105. df_books_with_author_first = df_books.withColumn('author_first', split(df_books['authors'], '/').getItem(0))
  106. df_books_with_author_first.show(10)
  107. df_books_with_author_first.createOrReplaceTempView('books_with_author_first')
  108. """
  109. (7) 统计不同作者的书籍的平均评分
  110. """
  111. df_author_books_avg_rating = spark.sql(
  112.     """
  113.     select author_first, sum(average_rating * ratings_count) / sum(ratings_count) as avg_rating, count(*) as books_num
  114.     from books_with_author_first
  115.     group by author_first
  116.     order by avg_rating desc, books_num desc
  117.     """
  118. )
  119. df_author_books_avg_rating = df_author_books_avg_rating.repartition(1)
  120. df_author_books_avg_rating.show(n=10, truncate=False)
  121. df_author_books_avg_rating.write.csv('/result/author_books_avg_rating.csv',
  122.                                      mode='overwrite')
  123. # 视图 books_attention_top_1000
  124. window = Window.orderBy(df_books_with_author_first['text_reviews_count'].desc())
  125. df_books_attention_rank = df_books_with_author_first.withColumn('rank', rank().over(window))
  126. df_books_attention_top_1000 = df_books_attention_rank.filter(df_books_attention_rank['rank'] <= 1000).drop('rank')
  127. df_books_attention_top_1000.show(10)
  128. df_books_attention_top_1000.createOrReplaceTempView('books_attention_top_1000')
  129. """
  130. (8) 统计在最受关注的书籍 Top 1000 中, 不同出版社出版的书籍数量
  131. """
  132. df_publisher_books_top_1000_num = spark.sql(
  133.     """
  134.     select publisher, count(*) as books_num
  135.     from books_attention_top_1000
  136.     group by publisher
  137.     order by books_num desc
  138.     """
  139. )
  140. df_publisher_books_top_1000_num.show(n=10, truncate=False)
  141. df_publisher_books_top_1000_num.write.csv('/result/publisher_books_top_1000_num.csv',
  142.                                           mode='overwrite')
  143. """
  144. (9) 统计在最受关注的书籍 Top 1000 中, 不同语言的书籍数量
  145. """
  146. df_language_books_top_1000_num = spark.sql(
  147.     """
  148.     select language_code, count(*) as books_num
  149.     from books_attention_top_1000
  150.     group by language_code
  151.     order by books_num desc
  152.     """
  153. )
  154. df_language_books_top_1000_num.show(n=10, truncate=False)
  155. df_language_books_top_1000_num.write.csv('/result/language_books_top_1000_num.csv',
  156.                                          mode='overwrite')
  157. """
  158. (10) 统计不同作者的书籍的平均受关注程度
  159. """
  160. df_author_books_avg_attention = spark.sql(
  161.     """
  162.     select author_first, sum(text_reviews_count) / count(*) as avg_attention, count(*) as books_num
  163.     from books_with_author_first
  164.     group by author_first
  165.     order by avg_attention desc, books_num desc
  166.     """
  167. )
  168. df_author_books_avg_attention = df_author_books_avg_attention.repartition(1)
  169. df_author_books_avg_attention.show(n=10, truncate=False)
  170. df_author_books_avg_attention.write.csv('/result/author_books_avg_attention.csv',
  171.                                         mode='overwrite')
复制代码
结果可视化

  1. # -*- coding: utf-8 -*-
  2. # @Time     : 2024/12/14 0:50
  3. # @Author   : 从心
  4. # @File     : spark_book_recommendation_analysis_visualization.py
  5. # @Software : PyCharm
  6. import pandas as pd
  7. import matplotlib.pyplot as plt
  8. plt.rcParams['font.sans-serif'] = ['SimHei']
  9. plt.rcParams['axes.unicode_minus'] = False
  10. import matplotlib.colors as mcolors
  11. def chart_1():
  12.     """
  13.     (1) 统计最受关注的书籍 Top 10
  14.     """
  15.     csv_path = '../result/books_attention_top_10.csv/part-r-00000-d44ddace-d5f4-42df-980d-d3e236064461.csv'
  16.     names = ['bookID', 'title', 'text_reviews_count', 'author_first', 'average_rating', 'isbn', 'isbn13',
  17.              'language_code',
  18.              'num_pages', 'ratings_count', 'publication_date', 'publisher']
  19.     df = pd.read_csv(csv_path, header=None, names=names)
  20.     df = df.head(10)
  21.     df = df.sort_values(by='text_reviews_count', ascending=True)
  22.     plt.figure(figsize=(16, 9))
  23.     y = df['title']
  24.     x = df['text_reviews_count']
  25.     plt.barh(y, x, color='skyblue')
  26.     for idx, value in enumerate(x):
  27.         plt.text(value, idx, f'{value}', va='center', ha='left')
  28.     plt.title('最受关注的书籍 Top 10', fontsize=16, fontweight='bold')
  29.     plt.ylabel('标题', fontsize=16, fontweight='bold')
  30.     plt.xlabel('评论数', fontsize=16, fontweight='bold')
  31.     plt.tight_layout()
  32.     plt.savefig('../visualization/[1]books_attention_top_10.png')
  33.     plt.show()
  34. def chart_2():
  35.     """
  36.     (2) 统计书籍篇幅 Top 10
  37.     """
  38.     csv_path = '../result/books_length_top_10.csv/part-r-00000-7be4f0a3-b317-408f-ba8a-41be340d193b.csv'
  39.     names = ['bookID', 'title', 'num_pages', 'author_first', 'average_rating', 'isbn', 'isbn13', 'language_code',
  40.              'ratings_count', 'text_reviews_count', 'publication_date', 'publisher']
  41.     df = pd.read_csv(csv_path, header=None, names=names)
  42.     df = df.head(10)
  43.     df = df.sort_values(by='num_pages', ascending=True)
  44.     plt.figure(figsize=(16, 9))
  45.     y = df['title']
  46.     x = df['num_pages']
  47.     plt.barh(y, x, color='skyblue')
  48.     for idx, value in enumerate(x):
  49.         plt.text(value, idx, f'{value}', va='center', ha='left')
  50.     plt.title('书籍篇幅 Top 10', fontsize=16, fontweight='bold')
  51.     plt.ylabel('标题', fontsize=16, fontweight='bold')
  52.     plt.xlabel('页数', fontsize=16, fontweight='bold')
  53.     plt.tight_layout()
  54.     plt.savefig('../visualization/[2]books_length_top_10.png')
  55.     plt.show()
  56. def chart_3():
  57.     """
  58.     (3) 统计不同出版社出版的书籍数量
  59.     """
  60.     csv_path = '../result/publisher_books_num.csv/part-r-00000-f627a337-9aed-44ac-a37c-6c03582640dc.csv'
  61.     names = ['publisher', 'books_num']
  62.     df = pd.read_csv(csv_path, header=None, names=names)
  63.     df = df.head(50)
  64.     plt.figure(figsize=(16, 9))
  65.     x = df['publisher']
  66.     y = df['books_num']
  67.     bars = plt.bar(x, y, color='skyblue')
  68.     for bar in bars:
  69.         height = bar.get_height()
  70.         plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{int(height)}', ha='center', rotation=0)
  71.     plt.title('不同出版社出版的书籍数量', fontsize=16, fontweight='bold')
  72.     plt.xlabel('出版社', fontsize=16, fontweight='bold')
  73.     plt.ylabel('书籍数量', fontsize=16, fontweight='bold')
  74.     plt.xticks(rotation=90)
  75.     plt.tight_layout()
  76.     plt.savefig('../visualization/[3]publisher_books_num.png')
  77.     plt.show()
  78. def chart_4():
  79.     """
  80.     (4) 统计不同语言的书籍数量
  81.     """
  82.     csv_path = '../result/language_books_num.csv/part-r-00000-28c77665-f13b-4bee-aef6-a83511f4213a.csv'
  83.     names = ['language_code', 'books_num']
  84.     df = pd.read_csv(csv_path, header=None, names=names)
  85.     df = df.head(50)
  86.     plt.figure(figsize=(16, 9))
  87.     x = df['language_code']
  88.     y = df['books_num']
  89.     bars = plt.bar(x, y, color='skyblue')
  90.     for bar in bars:
  91.         height = bar.get_height()
  92.         plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{int(height)}', ha='center', rotation=0)
  93.     plt.title('不同语言的书籍数量', fontsize=16, fontweight='bold')
  94.     plt.xlabel('语言', fontsize=16, fontweight='bold')
  95.     plt.ylabel('书籍数量', fontsize=16, fontweight='bold')
  96.     plt.tight_layout()
  97.     plt.savefig('../visualization/[4]language_books_num.png')
  98.     plt.show()
  99. def chart_5():
  100.     """
  101.     (5) 统计最不受关注的高分书籍 Top 10 (评分 4.5 以上, 评分人数 1w 以上, 评论数 200 以下)
  102.     """
  103.     csv_path = '../result/books_rating_no_attention_top_10.csv/part-r-00000-ec31e0ee-9f5b-4d11-bbd2-5cc0633be073.csv'
  104.     names = ['bookID', 'title', 'author_first', 'average_rating', 'isbn', 'isbn13', 'language_code', 'num_pages',
  105.              'ratings_count', 'text_reviews_count', 'publication_date', 'publisher']
  106.     df = pd.read_csv(csv_path, header=None, names=names)
  107.     df = df.head(10)
  108.     df = df.sort_values(by='average_rating', ascending=True)
  109.     plt.figure(figsize=(16, 9))
  110.     y = df['title']
  111.     x = df['average_rating']
  112.     plt.barh(y, x, color='skyblue')
  113.     for idx, value in enumerate(x):
  114.         plt.text(value, idx, f'{value}', va='center', ha='left')
  115.     plt.title('最不受关注的高分书籍 Top 10', fontsize=16, fontweight='bold')
  116.     plt.ylabel('标题', fontsize=16, fontweight='bold')
  117.     plt.xlabel('平均评分', fontsize=16, fontweight='bold')
  118.     plt.tight_layout()
  119.     plt.savefig('../visualization/[5]books_rating_no_attention_top_10.png')
  120.     plt.show()
  121. def chart_6():
  122.     """
  123.     (6) 统计不同年份出版的书籍数量
  124.     """
  125.     csv_path = '../result/year_books_num.csv/part-r-00000-40448964-0687-400c-94da-d390ca57f2d8.csv'
  126.     names = ['year', 'books_num']
  127.     df = pd.read_csv(csv_path, header=None, names=names)
  128.     plt.figure(figsize=(16, 9))
  129.     x = df['year']
  130.     y = df['books_num']
  131.     plt.plot(x, y, marker='o', linestyle='-', color='skyblue')
  132.     plt.title('不同年份出版的书籍数量', fontsize=16, fontweight='bold')
  133.     plt.xlabel('年份', fontsize=16, fontweight='bold')
  134.     plt.ylabel('书籍数量', fontsize=16, fontweight='bold')
  135.     plt.grid(True, which='both', linestyle='--', linewidth=0.5)
  136.     plt.tight_layout()
  137.     plt.savefig('../visualization/[6]year_books_num.png')
  138.     plt.show()
  139. def chart_7():
  140.     """
  141.     (7) 统计不同作者的书籍的平均评分
  142.     """
  143.     csv_path = '../result/author_books_avg_rating.csv/part-r-00000-5601e037-dc5b-4cd1-ba34-b2207717cb27.csv'
  144.     names = ['author_first', 'avg_rating', 'books_num']
  145.     df = pd.read_csv(csv_path, header=None, names=names)
  146.     df = df.head(50)
  147.     plt.figure(figsize=(16, 9))
  148.     x = df['author_first']
  149.     y = df['avg_rating']
  150.     bars = plt.bar(x, y, color='skyblue')
  151.     for bar in bars:
  152.         height = bar.get_height()
  153.         plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{height:.2f}', ha='center', rotation=45)
  154.     plt.title('不同作者的书籍的平均评分', fontsize=16, fontweight='bold')
  155.     plt.xlabel('第一作者', fontsize=16, fontweight='bold')
  156.     plt.ylabel('平均评分', fontsize=16, fontweight='bold')
  157.     plt.xticks(rotation=90)
  158.     plt.tight_layout()
  159.     plt.savefig('../visualization/[7]author_books_avg_rating.png')
  160.     plt.show()
  161. def chart_8():
  162.     """
  163.     (8) 统计在最受关注的书籍 Top 1000 中, 不同出版社出版的书籍数量
  164.     """
  165.     csv_path = '../result/publisher_books_top_1000_num.csv/part-r-00000-c9d19260-6251-446a-b55a-2f6864e295f7.csv'
  166.     names = ['publisher', 'books_num']
  167.     df = pd.read_csv(csv_path, header=None, names=names)
  168.     df = df.head(10)
  169.     plt.figure(figsize=(9, 9))
  170.     labels = df['publisher']
  171.     x = df['books_num']
  172.     plt.pie(x, labels=labels, autopct='%1.1f%%', startangle=0)
  173.     plt.title('在最受关注的书籍 Top 1000 中, 不同出版社出版的书籍数量')
  174.     plt.tight_layout()
  175.     plt.savefig('../visualization/[8]publisher_books_top_1000_num.png')
  176.     plt.show()
  177. def chart_9():
  178.     """
  179.     (9) 统计在最受关注的书籍 Top 1000 中, 不同语言的书籍数量
  180.     """
  181.     csv_path = '../result/language_books_top_1000_num.csv/part-r-00000-ecab523d-5d46-4e4f-a5f4-8dc083e049e2.csv'
  182.     names = ['language_code', 'books_num']
  183.     df = pd.read_csv(csv_path, header=None, names=names)
  184.     df = df.head(10)
  185.     plt.figure(figsize=(9, 9))
  186.     labels = df['language_code']
  187.     x = df['books_num']
  188.     wedges = plt.pie(x, labels=None, autopct='%1.1f%%', startangle=0)[0]
  189.     handles = [plt.Rectangle((0, 0), 1, 1, color=mcolors.to_rgba(wedge.get_facecolor())) for wedge in wedges]
  190.     plt.legend(handles, labels, title='语言', loc='upper right', bbox_to_anchor=(0.9, 0.9))
  191.     plt.title('在最受关注的书籍 Top 1000 中, 不同语言的书籍数量')
  192.     plt.tight_layout()
  193.     plt.savefig('../visualization/[9]language_books_top_1000_num.png')
  194.     plt.show()
  195. def chart_10():
  196.     """
  197.     (10) 统计不同作者的书籍的平均受关注程度
  198.     """
  199.     csv_path = '../result/author_books_avg_attention.csv/part-r-00000-b82eadc1-8db8-494d-befa-34bf3834cfb1.csv'
  200.     names = ['author_first', 'avg_attention', 'books_num']
  201.     df = pd.read_csv(csv_path, header=None, names=names)
  202.     df = df.head(50)
  203.     plt.figure(figsize=(16, 9))
  204.     x = df['author_first']
  205.     y = df['avg_attention']
  206.     bars = plt.bar(x, y, color='skyblue')
  207.     for bar in bars:
  208.         height = bar.get_height()
  209.         plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{int(height)}', ha='center', rotation=45)
  210.     plt.title('不同作者的书籍的平均受关注程度', fontsize=16, fontweight='bold')
  211.     plt.xlabel('第一作者', fontsize=16, fontweight='bold')
  212.     plt.ylabel('平均受关注程度', fontsize=16, fontweight='bold')
  213.     plt.xticks(rotation=90)
  214.     plt.tight_layout()
  215.     plt.savefig('../visualization/[10]author_books_avg_attention.png')
  216.     plt.show()
  217. if __name__ == '__main__':
  218.     for i in range(1, 11):
  219.         eval(f'chart_{i}()')
复制代码
(1)统计最受关注的书籍Top 10


(2)统计书籍篇幅Top 10


(3)统计不同出版社出版的书籍数量


(4)统计不同语言的书籍数量


(5)统计最不受关注的高分书籍Top 10(评分4.5以上,评分人数1w以上,评论数200以下)


(6)统计不同年份出版的书籍数量


(7)统计不同作者的书籍的平均评分


(8)统计在最受关注的书籍Top 1000中,不同出版社出版的书籍数量


(9)统计在最受关注的书籍Top 1000中,不同语言的书籍数量


(10)统计不同作者的书籍的平均受关注程度




免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息从访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。

本帖子中包含更多资源

您需要 登录 才可以下载或查看,没有账号?立即注册

x
回复

使用道具 举报

0 个回复

倒序浏览

快速回复

您需要登录后才可以回帖 登录 or 立即注册

本版积分规则

金牌会员
这个人很懒什么都没写!
快速回复 返回顶部 返回列表