Prepared by Karl Duckett - April 2021
If you don't know who Wilco are, have a listen on Spotify: https://open.spotify.com/artist/2QoU3awHVdcHS8LrZEKvSM
# Not all of these are used in this report, but it's my standard copy and paste for each Jupyter Notebook developed.
import pandas as pd
import numpy as np
import seaborn as sns
import os
import re
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
from numbers import Number
from tabulate import tabulate
from scipy import stats
import datetime
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Notebook-wide configuration: send DataFrame.plot() through Plotly and relax
# the pandas display limits (up to 100 columns, no column-width truncation).
pd.set_option("plotting.backend", "plotly")
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_colwidth", None)
py.init_notebook_mode(connected=True)

# Load the song data - it was extracted from Wikipedia and the Lyric Genius website.
df = pd.read_csv('Wilco.csv')
Create new columns: convert each duration from minutes and seconds to total seconds, and extract the word count of the lyrics.
def time_convert(x):
    """Convert a clock-style duration string into a total number of seconds.

    Accepts ``"SS"``, ``"MM:SS"`` or ``"HH:MM:SS"``; every colon-separated
    field must parse as an integer.

    Args:
        x: Duration text such as ``"3:45"``.

    Returns:
        The duration in whole seconds, as an ``int``.

    Raises:
        ValueError: If a field is not an integer or there are more than
            three colon-separated fields.
    """
    fields = [int(part) for part in x.split(':')]
    if len(fields) > 3:
        raise ValueError("unsupported duration format: %r" % (x,))

    # Fold left to right so each earlier field is worth 60x the next one
    # (hours -> minutes -> seconds).
    total = 0
    for value in fields:
        total = total * 60 + value
    return total
# Derive two numeric columns: track length in seconds and the number of
# whitespace-separated words in the lyrics.
durations = df['Duration']
df['Seconds'] = durations.apply(time_convert)
lyric_tokens = df['Lyrics'].str.split()
df['WordCount'] = lyric_tokens.str.len()
df.head(2)
# One long string holding every song's lyrics, separated by single spaces.
text = " ".join(df['Lyrics'])

# Start from the word cloud package's stop words and add a few extra words
# to exclude from the cloud.
stopwords = set(STOPWORDS) | {"know", "go", "want", "will", "see"}
import random
def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """Return a random light-grey HSL colour string for one word-cloud word.

    Only ``random_state`` influences the result; the remaining parameters are
    accepted so the function can be used as a word-cloud colour callback.

    Args:
        word: The word being coloured (unused).
        font_size: Font size of the word (unused).
        position: Position of the word (unused).
        orientation: Orientation of the word (unused).
        random_state: Optional source of randomness. May be an object with a
            ``randint(a, b)`` method (e.g. ``random.Random``) or an ``int``
            seed. When omitted, the module-level ``random`` generator is used.
        **kwargs: Any further keyword arguments are ignored.

    Returns:
        A string of the form ``"hsl(0, 0%, N%)"`` with ``60 <= N <= 100``.
    """
    if random_state is None:
        rng = random
    elif isinstance(random_state, int):
        # A bare seed: build a dedicated generator so the result is repeatable.
        rng = random.Random(random_state)
    else:
        # Honour the caller's generator so seeded recolouring is reproducible.
        rng = random_state
    return "hsl(0, 0%%, %d%%)" % rng.randint(60, 100)
# IPython's %time line magic reports how long this one statement takes to run.
# The statement builds a 1600x900 word cloud on a white background from `text`,
# excluding the words in `stopwords`.
%time wordcloud = WordCloud(stopwords=stopwords, background_color="white", width=1600,height=900).generate(text)
# Large canvas for the rendered image.
plt.figure(figsize = (32,18))
# Draw the cloud after recolouring it; random_state=3 is presumably there to
# make the colouring repeatable.
plt.imshow(wordcloud.recolor(random_state=3),
interpolation="bilinear")
# NOTE(review): the cloud is drawn a second time here, which looks redundant
# with the call above - confirm before removing.
plt.imshow(wordcloud, interpolation='bilinear')
# Hide the axes and display the figure.
plt.axis("off")
plt.show()
Word cloud of all song titles
# Join every song's lyrics, break the result into whitespace-separated words,
# and upper-case each word so counting is case-insensitive.
all_lyrics = " ".join(df['Lyrics'])
text = [word.upper() for word in all_lyrics.split()]
def wordListToFreqDict(text):
    """Count how often each item occurs in a sequence of words.

    Args:
        text: Iterable of hashable items (here, a list of words).

    Returns:
        A ``dict`` mapping each distinct item to its number of occurrences,
        with keys in order of first appearance.
    """
    # Counter tallies in a single pass; calling list.count() once per word
    # would rescan the whole list for every element.
    from collections import Counter

    return dict(Counter(text))
# Map each upper-cased lyric word to the number of times it occurs.
word_dict = wordListToFreqDict(text)
def sortFreqDict(word_dict):
    """Return the dictionary's entries as ``(count, word)`` pairs, largest first.

    Args:
        word_dict: Mapping of word -> occurrence count.

    Returns:
        A list of ``(count, word)`` tuples sorted in descending order; entries
        with equal counts are ordered by word, also descending.
    """
    # Put the count first in each tuple so it is the primary sort key.
    return sorted(
        ((count, word) for word, count in word_dict.items()),
        reverse=True,
    )
# (count, word) pairs ordered from most to least frequent.
total_words = sortFreqDict(word_dict)

# len(total_words) is the number of *distinct* words, so the overall total is
# obtained by adding up the per-word counts instead.
word_total = sum(count for count, _ in total_words)
print('Total words in all Wilco songs: ' + str(word_total))
print('Distinct words in all Wilco songs: ' + str(len(total_words)))
# Setting 'Title' as the index and immediately resetting it leaves a default
# integer index with 'Title' moved to the first column.
df.set_index('Title', inplace=True)
df.reset_index(inplace=True)

# Word count against track length, one point per song, coloured by album;
# hovering over a point shows the song title.
fig = px.scatter(
    data_frame=df,
    x="Seconds",
    y="WordCount",
    color="Album",
    hover_name='Title',
)
fig.show()
from plotly.subplots import make_subplots
# Dual-axis chart per song: word count as bars on the primary y-axis and
# duration in seconds as a line on the secondary y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

word_bars = go.Bar(
    x=df["Title"],
    y=df["WordCount"],
    name='Words',
    showlegend=False,
)
duration_line = go.Scatter(
    x=df['Title'],
    y=df['Seconds'],
    mode="lines",
    name='Seconds',
    line=go.scatter.Line(color="#ff4f5b"),
    showlegend=False,
)
fig.add_trace(word_bars, secondary_y=False)
fig.add_trace(duration_line, secondary_y=True)

# The marker colour is applied to every trace in the figure.
fig.update_traces(marker_color='#ffcf4d')

fig.update_yaxes(title_text="Word Count", secondary_y=False)
fig.update_yaxes(title_text="Duration (seconds)", secondary_y=True)
# Song titles are hidden on the x-axis.
fig.update_xaxes(showticklabels=False)

# Single layout call: the title is set once (centred) together with the height.
fig.update_layout(
    title_text='Word Count & Duration by Song',
    title_x=0.5,
    height=600,
)
fig.show()
# Per-album totals. numeric_only=True restricts the sum to numeric columns;
# without it, pandas versions that default to numeric_only=False concatenate
# the text columns of every song in an album into one long string.
album_totals = df.groupby(['Album']).sum(numeric_only=True)

# Remove the columns that are not wanted in the totals. errors='ignore' keeps
# this working if any of them was already excluded as non-numeric.
album_totals = album_totals.drop(columns=['Tempo', 'Key', 'Chords'], errors='ignore')
album_totals
album_totals.plot()
# Dual-axis chart per album: total word count as bars on the primary y-axis
# and total duration in seconds as a line on the secondary y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

album_word_bars = go.Bar(
    x=album_totals.index,
    y=album_totals["WordCount"],
    name='Words',
    showlegend=False,
)
album_duration_line = go.Scatter(
    x=album_totals.index,
    y=album_totals['Seconds'],
    mode="lines",
    line=go.scatter.Line(color="gray"),
    showlegend=False,
    name='Seconds',
)
fig.add_trace(album_word_bars, secondary_y=False)
fig.add_trace(album_duration_line, secondary_y=True)

fig.update_yaxes(title_text="Word Count", secondary_y=False)
fig.update_yaxes(title_text="Duration (seconds)", secondary_y=True)

# Single layout call: the album-level title is set once (centred) together
# with the axis label, figure height and unified hover mode.
fig.update_layout(
    xaxis_title="Album Name",
    title_text='Word Count & Total Duration by Album',
    title_x=0.5,
    height=600,
    hovermode="x unified",
)
fig.show()
By manipulating some of the charts generated above in SVG format, and with a little magic in Photoshop, we have a few design options for presenting the data.
There we go! Ready to hang on any Wilco fanatic's wall.
Explore more coding projects and data analysis at www.karlduckett.com. If you would like the large-scale version of the poster and alternative designs, use the "Contact Me" section to get in touch :)