数据集说明
- 在这个借助Goodbook网站收集的数据会集,可以获得有关书籍的信息,如作者、页数、评分和其他信息
文件说明
books.csv
- bookID,title,authors,average_rating,isbn,isbn13,language_code, num_pages,ratings_count,text_reviews_count,publication_date,publisher
- 1,Harry Potter and the Half-Blood Prince (Harry Potter #6),J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
- 2,Harry Potter and the Order of the Phoenix (Harry Potter #5),J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
- 4,Harry Potter and the Chamber of Secrets (Harry Potter #2),J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
- ...
复制代码 业务需求
(1)统计最受关注的书籍Top 10
(2)统计书籍篇幅Top 10
(3)统计差异出书社出书的书籍数量
(4)统计差异语言的书籍数量
(5)统计最不受关注的高分书籍Top 10(评分4.5以上,评分人数1w以上,批评数200以下)
(6)统计差异年份出书的书籍数量
(7)统计差异作者的书籍的均匀评分
(8)统计在最受关注的书籍Top 1000中,差异出书社出书的书籍数量
(9)统计在最受关注的书籍Top 1000中,差异语言的书籍数量
(10)统计差异作者的书籍的均匀受关注程度
需求实现
数据预处理处罚
- # -*- coding: utf-8 -*-
- # @Time : 2024/12/14 0:49
- # @Author : 从心
- # @File : spark_book_recommendation_analysis_preprocess.py
- # @Software : PyCharm
- import pandas as pd
- import numpy as np
- df = pd.read_csv('../data/books.csv', on_bad_lines='skip')
- df.columns = df.columns.str.strip()
- print(df.head(3))
- df.info()
- df = df.dropna()
- df.info()
- df = df.drop_duplicates(keep='first')
- df.info()
- def convert_date(date_str):
- try:
- converted_date = pd.to_datetime(date_str, format='%m/%d/%Y')
- return converted_date.strftime('%Y-%m-%d')
- except ValueError as e:
- print(f"{date_str} 转换失败: {e}")
- return np.nan
- df['publication_date'] = df['publication_date'].apply(convert_date)
- df = df.dropna()
- df.info()
- print(df['language_code'].unique())
- df.to_csv('../data/books_cleaned.csv', encoding='utf-8', index=False)
复制代码 数据统计分析
- # -*- coding: utf-8 -*-
- # @Time : 2024/12/14 0:50
- # @Author : 从心
- # @File : spark_book_recommendation_analysis.py
- # @Software : PyCharm
- from pyspark import SparkConf
- from pyspark.sql import SparkSession
- from pyspark.sql.functions import date_format, split, rank
- from pyspark.sql.window import Window
- spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()
- # 视图 books
- df_books = spark.read.csv('/input_spark_book_recommendation_analysis/books_cleaned.csv', header=True, inferSchema=True)
- df_books.show(10)
- df_books.createOrReplaceTempView('books')
- """
- (1) 统计最受关注的书籍 Top 10
- """
- df_books_attention_top_10 = spark.sql(
- """
- select bookID, title, text_reviews_count, substring_index(authors, '/', 1) as author_first, average_rating, isbn, isbn13, language_code, num_pages, ratings_count, publication_date, publisher
- from books
- order by text_reviews_count desc
- """
- )
- df_books_attention_top_10 = df_books_attention_top_10.repartition(1)
- df_books_attention_top_10.show(n=10, truncate=False)
- df_books_attention_top_10.write.csv('/result/books_attention_top_10.csv',
- mode='overwrite')
- """
- (2) 统计书籍篇幅 Top 10
- """
- df_books_length_top_10 = spark.sql(
- """
- select bookID, title, num_pages, substring_index(authors, '/', 1) as author_first, average_rating, isbn, isbn13, language_code, ratings_count, text_reviews_count, publication_date, publisher
- from books
- order by num_pages desc
- """
- )
- df_books_length_top_10 = df_books_length_top_10.repartition(1)
- df_books_length_top_10.show(n=10, truncate=False)
- df_books_length_top_10.write.csv('/result/books_length_top_10.csv', mode='overwrite')
- """
- (3) 统计不同出版社出版的书籍数量
- """
- df_publisher_books_num = spark.sql(
- """
- select publisher, count(*) as books_num
- from books
- group by publisher
- order by books_num desc
- """
- )
- df_publisher_books_num = df_publisher_books_num.repartition(1)
- df_publisher_books_num.show(n=10, truncate=False)
- df_publisher_books_num.write.csv('/result/publisher_books_num.csv',
- mode='overwrite')
- """
- (4) 统计不同语言的书籍数量
- """
- df_language_books_num = spark.sql(
- """
- select language_code, count(*) as books_num
- from books
- group by language_code
- order by books_num desc
- """
- )
- df_language_books_num = df_language_books_num.repartition(1)
- df_language_books_num.show(n=10, truncate=False)
- df_language_books_num.write.csv('/result/language_books_num.csv', mode='overwrite')
- """
- (5) 统计最不受关注的高分书籍 Top 10 (评分 4.5 以上, 评分人数 1w 以上, 评论数 200 以下)
- """
- df_books_rating_no_attention_top_10 = spark.sql(
- """
- select bookID, title, substring_index(authors, '/', 1) as author_first, average_rating, isbn, isbn13, language_code, num_pages, ratings_count, text_reviews_count, publication_date, publisher
- from books
- where average_rating > 4.5 and ratings_count > 10000 and text_reviews_count < 200
- order by text_reviews_count asc
- """
- )
- df_books_rating_no_attention_top_10 = df_books_rating_no_attention_top_10.repartition(1)
- df_books_rating_no_attention_top_10.show(n=10, truncate=False)
- df_books_rating_no_attention_top_10.write.csv(
- '/result/books_rating_no_attention_top_10.csv', mode='overwrite')
- # 视图 books_with_year
- df_books_with_year = df_books.withColumn('year', date_format(df_books['publication_date'], 'yyyy'))
- df_books_with_year.show(10)
- df_books_with_year.createOrReplaceTempView('books_with_year')
- """
- (6) 统计不同年份出版的书籍数量
- """
- df_year_books_num = spark.sql(
- """
- select year, count(*) as books_num
- from books_with_year
- group by year
- order by year asc
- """
- )
- df_year_books_num = df_year_books_num.repartition(1)
- df_year_books_num.show(n=10, truncate=False)
- df_year_books_num.write.csv('/result/year_books_num.csv', mode='overwrite')
- # 视图 books_with_author_first
- df_books_with_author_first = df_books.withColumn('author_first', split(df_books['authors'], '/').getItem(0))
- df_books_with_author_first.show(10)
- df_books_with_author_first.createOrReplaceTempView('books_with_author_first')
- """
- (7) 统计不同作者的书籍的平均评分
- """
- df_author_books_avg_rating = spark.sql(
- """
- select author_first, sum(average_rating * ratings_count) / sum(ratings_count) as avg_rating, count(*) as books_num
- from books_with_author_first
- group by author_first
- order by avg_rating desc, books_num desc
- """
- )
- df_author_books_avg_rating = df_author_books_avg_rating.repartition(1)
- df_author_books_avg_rating.show(n=10, truncate=False)
- df_author_books_avg_rating.write.csv('/result/author_books_avg_rating.csv',
- mode='overwrite')
- # 视图 books_attention_top_1000
- window = Window.orderBy(df_books_with_author_first['text_reviews_count'].desc())
- df_books_attention_rank = df_books_with_author_first.withColumn('rank', rank().over(window))
- df_books_attention_top_1000 = df_books_attention_rank.filter(df_books_attention_rank['rank'] <= 1000).drop('rank')
- df_books_attention_top_1000.show(10)
- df_books_attention_top_1000.createOrReplaceTempView('books_attention_top_1000')
- """
- (8) 统计在最受关注的书籍 Top 1000 中, 不同出版社出版的书籍数量
- """
- df_publisher_books_top_1000_num = spark.sql(
- """
- select publisher, count(*) as books_num
- from books_attention_top_1000
- group by publisher
- order by books_num desc
- """
- )
- df_publisher_books_top_1000_num.show(n=10, truncate=False)
- df_publisher_books_top_1000_num.write.csv('/result/publisher_books_top_1000_num.csv',
- mode='overwrite')
- """
- (9) 统计在最受关注的书籍 Top 1000 中, 不同语言的书籍数量
- """
- df_language_books_top_1000_num = spark.sql(
- """
- select language_code, count(*) as books_num
- from books_attention_top_1000
- group by language_code
- order by books_num desc
- """
- )
- df_language_books_top_1000_num.show(n=10, truncate=False)
- df_language_books_top_1000_num.write.csv('/result/language_books_top_1000_num.csv',
- mode='overwrite')
- """
- (10) 统计不同作者的书籍的平均受关注程度
- """
- df_author_books_avg_attention = spark.sql(
- """
- select author_first, sum(text_reviews_count) / count(*) as avg_attention, count(*) as books_num
- from books_with_author_first
- group by author_first
- order by avg_attention desc, books_num desc
- """
- )
- df_author_books_avg_attention = df_author_books_avg_attention.repartition(1)
- df_author_books_avg_attention.show(n=10, truncate=False)
- df_author_books_avg_attention.write.csv('/result/author_books_avg_attention.csv',
- mode='overwrite')
复制代码 结果可视化
- # -*- coding: utf-8 -*-
- # @Time : 2024/12/14 0:50
- # @Author : 从心
- # @File : spark_book_recommendation_analysis_visualization.py
- # @Software : PyCharm
- import pandas as pd
- import matplotlib.pyplot as plt
- plt.rcParams['font.sans-serif'] = ['SimHei']
- plt.rcParams['axes.unicode_minus'] = False
- import matplotlib.colors as mcolors
- def chart_1():
- """
- (1) 统计最受关注的书籍 Top 10
- """
- csv_path = '../result/books_attention_top_10.csv/part-r-00000-d44ddace-d5f4-42df-980d-d3e236064461.csv'
- names = ['bookID', 'title', 'text_reviews_count', 'author_first', 'average_rating', 'isbn', 'isbn13',
- 'language_code',
- 'num_pages', 'ratings_count', 'publication_date', 'publisher']
- df = pd.read_csv(csv_path, header=None, names=names)
- df = df.head(10)
- df = df.sort_values(by='text_reviews_count', ascending=True)
- plt.figure(figsize=(16, 9))
- y = df['title']
- x = df['text_reviews_count']
- plt.barh(y, x, color='skyblue')
- for idx, value in enumerate(x):
- plt.text(value, idx, f'{value}', va='center', ha='left')
- plt.title('最受关注的书籍 Top 10', fontsize=16, fontweight='bold')
- plt.ylabel('标题', fontsize=16, fontweight='bold')
- plt.xlabel('评论数', fontsize=16, fontweight='bold')
- plt.tight_layout()
- plt.savefig('../visualization/[1]books_attention_top_10.png')
- plt.show()
- def chart_2():
- """
- (2) 统计书籍篇幅 Top 10
- """
- csv_path = '../result/books_length_top_10.csv/part-r-00000-7be4f0a3-b317-408f-ba8a-41be340d193b.csv'
- names = ['bookID', 'title', 'num_pages', 'author_first', 'average_rating', 'isbn', 'isbn13', 'language_code',
- 'ratings_count', 'text_reviews_count', 'publication_date', 'publisher']
- df = pd.read_csv(csv_path, header=None, names=names)
- df = df.head(10)
- df = df.sort_values(by='num_pages', ascending=True)
- plt.figure(figsize=(16, 9))
- y = df['title']
- x = df['num_pages']
- plt.barh(y, x, color='skyblue')
- for idx, value in enumerate(x):
- plt.text(value, idx, f'{value}', va='center', ha='left')
- plt.title('书籍篇幅 Top 10', fontsize=16, fontweight='bold')
- plt.ylabel('标题', fontsize=16, fontweight='bold')
- plt.xlabel('页数', fontsize=16, fontweight='bold')
- plt.tight_layout()
- plt.savefig('../visualization/[2]books_length_top_10.png')
- plt.show()
- def chart_3():
- """
- (3) 统计不同出版社出版的书籍数量
- """
- csv_path = '../result/publisher_books_num.csv/part-r-00000-f627a337-9aed-44ac-a37c-6c03582640dc.csv'
- names = ['publisher', 'books_num']
- df = pd.read_csv(csv_path, header=None, names=names)
- df = df.head(50)
- plt.figure(figsize=(16, 9))
- x = df['publisher']
- y = df['books_num']
- bars = plt.bar(x, y, color='skyblue')
- for bar in bars:
- height = bar.get_height()
- plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{int(height)}', ha='center', rotation=0)
- plt.title('不同出版社出版的书籍数量', fontsize=16, fontweight='bold')
- plt.xlabel('出版社', fontsize=16, fontweight='bold')
- plt.ylabel('书籍数量', fontsize=16, fontweight='bold')
- plt.xticks(rotation=90)
- plt.tight_layout()
- plt.savefig('../visualization/[3]publisher_books_num.png')
- plt.show()
- def chart_4():
- """
- (4) 统计不同语言的书籍数量
- """
- csv_path = '../result/language_books_num.csv/part-r-00000-28c77665-f13b-4bee-aef6-a83511f4213a.csv'
- names = ['language_code', 'books_num']
- df = pd.read_csv(csv_path, header=None, names=names)
- df = df.head(50)
- plt.figure(figsize=(16, 9))
- x = df['language_code']
- y = df['books_num']
- bars = plt.bar(x, y, color='skyblue')
- for bar in bars:
- height = bar.get_height()
- plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{int(height)}', ha='center', rotation=0)
- plt.title('不同语言的书籍数量', fontsize=16, fontweight='bold')
- plt.xlabel('语言', fontsize=16, fontweight='bold')
- plt.ylabel('书籍数量', fontsize=16, fontweight='bold')
- plt.tight_layout()
- plt.savefig('../visualization/[4]language_books_num.png')
- plt.show()
- def chart_5():
- """
- (5) 统计最不受关注的高分书籍 Top 10 (评分 4.5 以上, 评分人数 1w 以上, 评论数 200 以下)
- """
- csv_path = '../result/books_rating_no_attention_top_10.csv/part-r-00000-ec31e0ee-9f5b-4d11-bbd2-5cc0633be073.csv'
- names = ['bookID', 'title', 'author_first', 'average_rating', 'isbn', 'isbn13', 'language_code', 'num_pages',
- 'ratings_count', 'text_reviews_count', 'publication_date', 'publisher']
- df = pd.read_csv(csv_path, header=None, names=names)
- df = df.head(10)
- df = df.sort_values(by='average_rating', ascending=True)
- plt.figure(figsize=(16, 9))
- y = df['title']
- x = df['average_rating']
- plt.barh(y, x, color='skyblue')
- for idx, value in enumerate(x):
- plt.text(value, idx, f'{value}', va='center', ha='left')
- plt.title('最不受关注的高分书籍 Top 10', fontsize=16, fontweight='bold')
- plt.ylabel('标题', fontsize=16, fontweight='bold')
- plt.xlabel('平均评分', fontsize=16, fontweight='bold')
- plt.tight_layout()
- plt.savefig('../visualization/[5]books_rating_no_attention_top_10.png')
- plt.show()
- def chart_6():
- """
- (6) 统计不同年份出版的书籍数量
- """
- csv_path = '../result/year_books_num.csv/part-r-00000-40448964-0687-400c-94da-d390ca57f2d8.csv'
- names = ['year', 'books_num']
- df = pd.read_csv(csv_path, header=None, names=names)
- plt.figure(figsize=(16, 9))
- x = df['year']
- y = df['books_num']
- plt.plot(x, y, marker='o', linestyle='-', color='skyblue')
- plt.title('不同年份出版的书籍数量', fontsize=16, fontweight='bold')
- plt.xlabel('年份', fontsize=16, fontweight='bold')
- plt.ylabel('书籍数量', fontsize=16, fontweight='bold')
- plt.grid(True, which='both', linestyle='--', linewidth=0.5)
- plt.tight_layout()
- plt.savefig('../visualization/[6]year_books_num.png')
- plt.show()
- def chart_7():
- """
- (7) 统计不同作者的书籍的平均评分
- """
- csv_path = '../result/author_books_avg_rating.csv/part-r-00000-5601e037-dc5b-4cd1-ba34-b2207717cb27.csv'
- names = ['author_first', 'avg_rating', 'books_num']
- df = pd.read_csv(csv_path, header=None, names=names)
- df = df.head(50)
- plt.figure(figsize=(16, 9))
- x = df['author_first']
- y = df['avg_rating']
- bars = plt.bar(x, y, color='skyblue')
- for bar in bars:
- height = bar.get_height()
- plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{height:.2f}', ha='center', rotation=45)
- plt.title('不同作者的书籍的平均评分', fontsize=16, fontweight='bold')
- plt.xlabel('第一作者', fontsize=16, fontweight='bold')
- plt.ylabel('平均评分', fontsize=16, fontweight='bold')
- plt.xticks(rotation=90)
- plt.tight_layout()
- plt.savefig('../visualization/[7]author_books_avg_rating.png')
- plt.show()
- def chart_8():
- """
- (8) 统计在最受关注的书籍 Top 1000 中, 不同出版社出版的书籍数量
- """
- csv_path = '../result/publisher_books_top_1000_num.csv/part-r-00000-c9d19260-6251-446a-b55a-2f6864e295f7.csv'
- names = ['publisher', 'books_num']
- df = pd.read_csv(csv_path, header=None, names=names)
- df = df.head(10)
- plt.figure(figsize=(9, 9))
- labels = df['publisher']
- x = df['books_num']
- plt.pie(x, labels=labels, autopct='%1.1f%%', startangle=0)
- plt.title('在最受关注的书籍 Top 1000 中, 不同出版社出版的书籍数量')
- plt.tight_layout()
- plt.savefig('../visualization/[8]publisher_books_top_1000_num.png')
- plt.show()
- def chart_9():
- """
- (9) 统计在最受关注的书籍 Top 1000 中, 不同语言的书籍数量
- """
- csv_path = '../result/language_books_top_1000_num.csv/part-r-00000-ecab523d-5d46-4e4f-a5f4-8dc083e049e2.csv'
- names = ['language_code', 'books_num']
- df = pd.read_csv(csv_path, header=None, names=names)
- df = df.head(10)
- plt.figure(figsize=(9, 9))
- labels = df['language_code']
- x = df['books_num']
- wedges = plt.pie(x, labels=None, autopct='%1.1f%%', startangle=0)[0]
- handles = [plt.Rectangle((0, 0), 1, 1, color=mcolors.to_rgba(wedge.get_facecolor())) for wedge in wedges]
- plt.legend(handles, labels, title='语言', loc='upper right', bbox_to_anchor=(0.9, 0.9))
- plt.title('在最受关注的书籍 Top 1000 中, 不同语言的书籍数量')
- plt.tight_layout()
- plt.savefig('../visualization/[9]language_books_top_1000_num.png')
- plt.show()
- def chart_10():
- """
- (10) 统计不同作者的书籍的平均受关注程度
- """
- csv_path = '../result/author_books_avg_attention.csv/part-r-00000-b82eadc1-8db8-494d-befa-34bf3834cfb1.csv'
- names = ['author_first', 'avg_attention', 'books_num']
- df = pd.read_csv(csv_path, header=None, names=names)
- df = df.head(50)
- plt.figure(figsize=(16, 9))
- x = df['author_first']
- y = df['avg_attention']
- bars = plt.bar(x, y, color='skyblue')
- for bar in bars:
- height = bar.get_height()
- plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{int(height)}', ha='center', rotation=45)
- plt.title('不同作者的书籍的平均受关注程度', fontsize=16, fontweight='bold')
- plt.xlabel('第一作者', fontsize=16, fontweight='bold')
- plt.ylabel('平均受关注程度', fontsize=16, fontweight='bold')
- plt.xticks(rotation=90)
- plt.tight_layout()
- plt.savefig('../visualization/[10]author_books_avg_attention.png')
- plt.show()
- if __name__ == '__main__':
- for i in range(1, 11):
- eval(f'chart_{i}()')
复制代码 (1)统计最受关注的书籍Top 10
(2)统计书籍篇幅Top 10
(3)统计差异出书社出书的书籍数量
(4)统计差异语言的书籍数量
(5)统计最不受关注的高分书籍Top 10(评分4.5以上,评分人数1w以上,批评数200以下)
(6)统计差异年份出书的书籍数量
(7)统计差异作者的书籍的均匀评分
(8)统计在最受关注的书籍Top 1000中,差异出书社出书的书籍数量
(9)统计在最受关注的书籍Top 1000中,差异语言的书籍数量
(10)统计差异作者的书籍的均匀受关注程度
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息从访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。 |