Multi-Dimensional Association Rule Analysis of the TMDB Movie Dataset in Python (Business Big-Data Analysis with Python)
Resource Overview
TMDB, short for The Movie Database, is an online movie database that provides detailed information on films, TV shows, and the people who make them (actors, directors, and other crew). Beyond the raw movie data, it also offers news, reviews, and user ratings, making it a key source of movie information for enthusiasts and professionals alike. This project goes beyond the simple treatments of the dataset commonly seen elsewhere and deconstructs it in depth: ① mining association rules across different movie attributes; ② using association rules over genre, profit, and return rate to analyze overall profitability; ③ using genre and return-rate rules to identify low-budget, high-return films; and ④ simulating viewers' genre ratings for association-rule and clustering analysis. It is suited to Python business big-data scenarios that require attribute construction and in-depth analysis.

# -*- coding: utf-8 -*-
"""
Created on Mon Nov 18 17:56:46 2024

@author: 62561
"""

# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

# Read the data
movies = pd.read_csv('tmdb_5000_movies.csv')
movies.info()
movies.isnull().sum()
movies.describe()

# Data preprocessing
movies.query('budget==0').head(2)
movies.drop(['homepage', 'id', 'keywords', 'tagline', 'overview', 'spoken_languages', 'status'],
            axis=1, inplace=True)
movies.shape
movies.dropna(inplace=True)
movies.shape
movies.drop_duplicates(keep='first', inplace=True)
for col in movies.columns:
    zero = len(movies[movies[col] == 0])
    print('{}: {} zero values'.format(col, zero))
movies = movies.drop(index=movies.query('budget==0 or revenue==0').index)
movies.shape
movies.describe()

# Attribute construction: 1 - budget / revenue gives a new attribute, the return rate
movies['ReturnRate'] = 1 - movies['budget'] / movies['revenue']
movies.describe()

# Summary statistics
# Correlation matrix: correlation coefficient between every pair of numeric variables
alld = movies.corr(numeric_only=True)  # numeric_only avoids errors on object columns in pandas >= 2.0
# Correlation between revenue and the other fields
movies['release_year'] = [i.year for i in pd.to_datetime(movies['release_date'])]
a = movies[['budget', 'popularity', 'vote_average', 'vote_count', 'runtime', 'release_year', 'revenue']].corr()

# Top 10 movies by revenue
top10 = movies.sort_values('revenue', ascending=False).head(10)
top10[['budget', 'genres', 'original_title', 'release_date', 'revenue', 'runtime',
       'vote_average', 'vote_count', 'popularity']]

# Release-year distribution
movies['release_year'].value_counts()
movies['release_year'].hist(bins=25)
# Runtime distribution
movies['runtime'].value_counts()
movies['runtime'].hist(bins=30)
# Budget distribution
movies['budget'].hist(bins=15)
# Rating distribution
movies['vote_average'].hist(bins=20)
# Vote-count distribution
movies['vote_count'].hist(bins=20)

# Factors influencing revenue
# Scatter plots of revenue against each candidate factor
plt.rcParams['font.sans-serif'] = [u'SimHei']  # Chinese-capable font for the plot labels
list1 = ['budget', 'popularity', 'vote_count', 'vote_average', 'runtime', 'release_year']
fig, axes = plt.subplots(2, 3, figsize=(12, 8), dpi=70)
for i in range(2):
    for j in range(3):
        axes[i, j].scatter(x=list1[3*i+j], y='revenue', data=movies, s=3)
        axes[i, j].set_title(list1[3*i+j])
fig.suptitle('影响电影票房的因素')
fig.show()

# Grouped analysis of revenue against the candidate factors
movies = movies[(movies['release_year'] > 2000) & (movies['release_year'] <= 2015)]
movies.shape
edge = [2000, 2005, 2010, 2015]
moviesclass = ['2001-2005年', '2006-2010年', '2011-2015年']
movies['fiveyearclass'] = pd.cut(movies.loc[:, 'release_year'], edge, labels=moviesclass, include_lowest=True)
movies.head()
edge = movies['revenue'].quantile([0, 0.25, 0.5, 0.75, 1]).values
moviesclass = ['Low', 'Medium', 'Moderately High', 'High']
movies['revenueclass'] = pd.cut(movies.loc[:, 'revenue'], edge, labels=moviesclass, include_lowest=True)
movies.head()
d_summary = movies.groupby(['fiveyearclass', 'revenueclass']).median(numeric_only=True)
d_summary
list1 = ['budget', 'popularity', 'vote_count', 'vote_average', 'runtime', 'release_year']
pos = list(range(len(d_summary[:'2001-2005年'])))
width = 0.2
fig, ax = plt.subplots(2, 3, figsize=(10, 5))
plt.subplots_adjust(wspace=0.4, hspace=0.5)
for i in range(2):
    for j in range(3):
        ax[i, j].bar(pos, d_summary.loc['2001-2005年'][list1[3*i+j]], width, label='2001-2005年')
        ax[i, j].bar([p + width for p in pos], d_summary.loc['2006-2010年'][list1[3*i+j]], width, label='2006-2010年')
        ax[i, j].bar([p + width*2 for p in pos], d_summary.loc['2011-2015年'][list1[3*i+j]], width, label='2011-2015年')
        ax[i, j].set_ylabel(list1[3*i+j])
        ax[i, j].set_xlabel('revenue')
        ax[i, j].set_title('revenue&' + list1[3*i+j])
        ax[i, j].set_xticks([p + 1.5 * width for p in pos])
        ax[i, j].set_xticklabels(['低', '中下', '中上', '高'])
        ax[i, j].legend()
        ax[i, j].grid()
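The description above promises rules over genre, profit, and return rate (items ② and ③), but that part of the analysis is not visible in this excerpt of the script. The sketch below shows one way it could be wired up with the MultiLabelBinarizer and mlxtend imports already at the top of the file: each movie becomes a transaction of its genres plus a revenue-class tag and a return-rate tag. The variable names (movie_transactions, movie_rules), the return-rate cut points, and the support/confidence thresholds are illustrative assumptions, not the author's actual settings.

import json

# Sketch for items ② and ③: one transaction per movie = genres + revenue class + return-rate tag
genre_lists = movies['genres'].apply(lambda s: [g['name'] for g in json.loads(s)])
return_tag = pd.cut(movies['ReturnRate'], bins=[-np.inf, 0, 0.5, 1],
                    labels=['Return_Loss', 'Return_Moderate', 'Return_High'])
movie_transactions = [
    genres + ['Revenue_' + str(rev), str(ret)]
    for genres, rev, ret in zip(genre_lists, movies['revenueclass'], return_tag)
]

# One-hot encode the transactions for mlxtend
mlb = MultiLabelBinarizer()
movie_onehot = pd.DataFrame(mlb.fit_transform(movie_transactions),
                            columns=mlb.classes_).astype(bool)

movie_freq = apriori(movie_onehot, min_support=0.05, use_colnames=True)
movie_rules = association_rules(movie_freq, metric='confidence', min_threshold=0.6)

# Item ③: rules whose consequent is the high-return tag point to low-budget, high-return genres
high_return = movie_rules[movie_rules['consequents'].apply(lambda c: 'Return_High' in c)]
print(high_return.sort_values('lift', ascending=False).head(10))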
# Impact of movie genre on revenue
def counttype(movies):
    # Count how many movies fall into each genre (the genres column holds JSON-like strings)
    moviesgeners = {}
    for i in movies.genres:
        for j in eval(i):
            if j['name'] not in moviesgeners:
                moviesgeners[j['name']] = 1
            else:
                moviesgeners[j['name']] += 1
    return moviesgeners

plt.figure(figsize=(10, 5))
moviesgeners = counttype(movies)
moviesgeners = list(moviesgeners.items())
moviesgeners.sort(key=lambda tup: tup[1], reverse=True)
x = [i[0] for i in moviesgeners]
y = [i[1] for i in moviesgeners]
plt.bar(x, y)
plt.xticks(rotation=45)
plt.xlabel("电影类型")
plt.ylabel("电影数量")
plt.title("电影类型分布条形图")
plt.show()

top100 = movies.sort_values('popularity', ascending=False).head(100)
moviesgeners = counttype(top100)
moviesgeners

# Top five genres within each revenue class
moviesclass = ['Low', 'Medium', 'Moderately High', 'High']
groupmovies = movies.groupby('revenueclass')
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
plt.subplots_adjust(wspace=0.2, hspace=0.2)
i = 0
for j in range(2):
    for k in range(2):
        data = groupmovies.get_group(moviesclass[i])
        moviesCounttype = counttype(data)
        moviesCounttype = list(moviesCounttype.items())
        moviesCounttype.sort(key=lambda tup: tup[1], reverse=True)
        moviesCounttype = moviesCounttype[:5]
        x = [i[0] for i in moviesCounttype]
        y = [i[1] for i in moviesCounttype]
        axes[j, k].bar(x, y, width=0.5)
        axes[j, k].title.set_text(moviesclass[i])
        i += 1

import json
import random
from efficient_apriori import apriori

# Convert the genres column from JSON strings to lists
movies['genres'] = movies['genres'].apply(json.loads)

# Generate simulated transaction data, weighting movie choice by popularity
def generate_transactions(movies, num_users=200):
    transactions = []
    for _ in range(num_users):
        user_movies = movies.sample(frac=1, weights='popularity').head(random.randint(1, 10))
        watched_genres = set()
        for _, row in user_movies.iterrows():
            watched_genres.update([genre['name'] for genre in row['genres']])
        transactions.append(list(watched_genres))
    return transactions

random.seed(42)
transactions = generate_transactions(movies)

# Association-rule mining with efficient-apriori
min_support = 0.5     # support threshold
min_confidence = 0.9  # confidence threshold

# Generate frequent itemsets and association rules
itemsets, rules = apriori(transactions, min_support=min_support, min_confidence=min_confidence)

# Print the interesting rules
for rule in rules:
    lhs = ', '.join(rule.lhs)  # antecedent (left-hand side) of the rule
    rhs = ', '.join(rule.rhs)  # consequent (right-hand side) of the rule
    print(f"If a user watches {lhs}, they are likely to also watch {rhs} (confidence: {rule.confidence:.2f})")

import networkx as nx

# transactions is the list of simulated viewing transactions prepared above
# itemsets, rules = apriori(transactions, min_support=0.05, min_confidence=0.7)

# Draw the association rules as a directed network graph
def plot_rules(rules):
    G = nx.DiGraph()
    # Add nodes and edges
    for rule in rules:
        antecedents = rule.lhs  # lhs (left-hand side) holds the antecedents
        consequents = rule.rhs  # rhs (right-hand side) holds the consequents
        for antecedent in antecedents:
            G.add_node(antecedent, label=antecedent)
        for consequent in consequents:
            G.add_node(consequent, label=consequent)
        for antecedent in antecedents:
            for consequent in consequents:
                G.add_edge(antecedent, consequent, weight=rule.confidence)
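    # The pasted script breaks off at this point. What follows is a minimal completion
    # sketch of the network plot using standard networkx/matplotlib calls; the layout,
    # node styling, and confidence edge labels are illustrative choices, not the author's.
    pos = nx.spring_layout(G, seed=42)
    nx.draw(G, pos, with_labels=True, node_color='lightblue',
            node_size=1500, font_size=9, arrows=True)
    edge_labels = {(u, v): f"{d['weight']:.2f}" for u, v, d in G.edges(data=True)}
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8)
    plt.title('Association-rule network (edge label = confidence)')
    plt.show()

plot_rules(rules)

Item ④ in the description also mentions clustering the simulated viewers, which is likewise not visible in this excerpt. Below is a minimal sketch, assuming k-means on the one-hot genre matrix built from the same transactions list; the choice of KMeans, k=4, and the random seed are illustrative, not the author's actual configuration.

from sklearn.cluster import KMeans

# One-hot encode each simulated viewer's watched genres, then cluster the viewers
mlb = MultiLabelBinarizer()
user_genre_matrix = mlb.fit_transform(transactions)

kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(user_genre_matrix)

# Per-cluster genre profile: share of viewers in each cluster who watched each genre
profile = pd.DataFrame(user_genre_matrix, columns=mlb.classes_).assign(cluster=cluster_labels)
print(profile.groupby('cluster').mean().round(2))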