## DETECTING OUTLIERS
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Seed NumPy's global RNG so the simulated sample is the same on every run.
np.random.seed(42)

# 1,000 draws from a normal distribution (mean 0, std 1) in a single 'value' column.
data = pd.DataFrame({'value': np.random.normal(loc=0, scale=1, size=1000)})
data
# Visualize the data with outliers using scatter plot and box plot

# Boolean mask of outliers. It uses the 1.5 * IQR rule, which is the same rule
# the box plot applies to decide which points fall beyond its whiskers, so the
# two panels agree on what counts as an outlier.
q1, q3 = data['value'].quantile([0.25, 0.75])
iqr = q3 - q1
outliers = (data['value'] < q1 - 1.5 * iqr) | (data['value'] > q3 + 1.5 * iqr)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Scatter plot: flagged points in red, everything else in blue
point_colors = ['red' if flagged else 'blue' for flagged in outliers]
ax1.scatter(range(len(data)), data['value'], c=point_colors)
ax1.set_title('Dataset with Outliers Highlighted (Scatter Plot)')
ax1.set_xlabel('Index')
ax1.set_ylabel('Value')

# Box plot
sns.boxplot(x=data['value'], ax=ax2)
ax2.set_title('Dataset with Outliers (Box Plot)')
ax2.set_xlabel('Value')

plt.tight_layout()
plt.show()
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One grid row per series: (column, scatter title, box-plot title, axis label).
panels = [
    ('value', 'Original Data (Scatter Plot)', 'Original Data (Box Plot)', 'Value'),
    ('log_value', 'Log Transformed Data (Scatter Plot)', 'Log Transformed Data (Box Plot)', 'Log(Value)'),
]

for (scatter_ax, box_ax), (column, scatter_title, box_title, label) in zip(axes, panels):
    # Left panel: values against their row position
    scatter_ax.scatter(range(len(df)), df[column], alpha=0.5)
    scatter_ax.set_title(scatter_title)
    scatter_ax.set_xlabel('Index')
    scatter_ax.set_ylabel(label)

    # Right panel: box plot of the same column
    sns.boxplot(x=df[column], ax=box_ax)
    box_ax.set_title(box_title)
    box_ax.set_xlabel(label)

plt.tight_layout()
plt.show()
# https://github.com/amandaiglesiasmoreno/visualizations/blob/main/How%20to%20design%20more%20informative%20visualizations.ipynb
# Data-to-ink ratio: https://www.youtube.com/watch?v=JIMUzJzqaA8
import plotly.graph_objects as go
import polars as pl

# Load the long-format sales file and reshape it to one row per year with one
# column per phone.
data = pl.read_parquet('handsets.parquet').pivot(on='Phone', values='Sales', index='Year')
# data.plot.line(x='Year')

# Create the line plot
fig = go.Figure()

# Add a trace for each phone; only the Nokia 1100 is drawn in blue
for phone in ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]:
    color = 'blue' if phone == "Nokia 1100" else 'grey'
    fig.add_trace(go.Scatter(
        x=data["Year"],
        y=data[phone],
        mode='lines+markers+text',
        name=phone,
        line=dict(color=color),
        # Show text only at the last point, so it acts as an inline legend
        text=[None] * (len(data["Year"]) - 1) + [phone],
        textposition='top center',
        textfont_size=10.5,
    ))

# Update layout
fig.update_layout(
    title="Sales of Different Phones (2006-2015)",
    xaxis_title="Year",
    yaxis_title="Sales",
    showlegend=False,  # The end-of-line labels replace the legend
    template="plotly",
    height=600,
    width=1000,
    margin=dict(r=150),  # Extra right margin so the end labels are not cut off
    paper_bgcolor="LightSteelBlue",
    plot_bgcolor="LightSteelBlue",
)

# Show the plot
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
# Persist the current frame to Parquet.
df.write_parquet('handsets.parquet')

# Echo the frame.
df
import polars as pl

# Reload the handset file and multiply the 'Sales' column by 1,000.
df = pl.read_parquet('handsets.parquet').with_columns(pl.col('Sales') * 1_000)
df
shape: (50, 3)
Year
Phone
Sales
i64
str
i64
2006
"Nokia 1100"
22992000
2006
"Samsung E250"
18921000
2006
"LG Chocolate"
11479000
2006
"Motorola Razr"
16314000
2006
"BlackBerry Pearl"
21898000
…
…
…
2015
"Nokia 1100"
27159000
2015
"Samsung E250"
25866000
2015
"LG Chocolate"
24107000
2015
"Motorola Razr"
22381000
2015
"BlackBerry Pearl"
16867000
# Long -> wide: one row per 'Year', one 'Sales' column per 'Phone'.
data = df.pivot(on='Phone', values='Sales', index='Year')
# data.plot.line(x='Year')
import plotly.graph_objects as go

# Create the line plot with one lines+markers trace per phone column of `data`.
fig = go.Figure()
for phone in ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]:
    trace = go.Scatter(x=data["Year"], y=data[phone], mode='lines+markers', name=phone)
    fig.add_trace(trace)

# Titles, legend heading and theme
layout_options = dict(
    title="Sales of Different Phones (2006-2015)",
    xaxis_title="Year",
    yaxis_title="Sales",
    legend_title="Phone",
    template="plotly",
)
fig.update_layout(**layout_options)

# Show the plot
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
import numpy as np
import polars as pl

# Define the years and phones
years = list(range(2006, 2015 + 1))
phones = ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]

# Generate sales data with Nokia 1100 having slightly higher sales.
# Each element of sales_data is one (year, phone, sales) row.
sales_data = []
for year in years:
    # Random sales values for the other phones (every phone except the first)
    sales_values = np.random.randint(10000, 30000, size=len(phones) - 1)

    # Nokia 1100 sells 7% more than the best of the other phones that year
    max_other_sales = sales_values.max()
    nokia_sales = int(max_other_sales * 1.07)

    # Nokia first, to line up with the order of `phones`; .tolist() yields plain
    # Python ints so the column has one consistent integer type
    yearly_sales = [nokia_sales] + sales_values.tolist()
    sales_data.extend((year, phone, sales) for phone, sales in zip(phones, yearly_sales))

# Create the DataFrame directly from the rows
df = pl.DataFrame(sales_data, schema=["Year", "Phone", "Sales"], orient="row")

# Display the DataFrame
df

# Wide version: one row per year, one column per phone
data = df.pivot(on='Phone', values='Sales', index='Year')
# data.plot.line(x='Year')
import polars as pl

# Table consumed by the charts that follow; presumably the wide
# Year-by-phone layout -- verify against the file.
data = pl.read_parquet('old_phones.parquet')
import plotly.graph_objects as go

phone_models = ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]

# Create the line plot
fig = go.Figure()

for phone in phone_models:
    # The Nokia 1100 is blue; every other phone is grey.
    if phone == "Nokia 1100":
        color = 'blue'
    else:
        color = 'grey'

    # Blank label on every point except the last, which carries the phone name.
    end_label = [None] * (len(data["Year"]) - 1) + [phone]

    trace = go.Scatter(
        x=data["Year"],
        y=data[phone],
        mode='lines+markers+text',
        name=phone,
        line=dict(color=color),
        text=end_label,
        textposition='top center',
        textfont_size=10.5,
    )
    fig.add_trace(trace)

# Layout: no legend, wider canvas and a right margin that leaves room for the labels.
fig.update_layout(
    title="Sales of Different Phones (2006-2015)",
    xaxis_title="Year",
    yaxis_title="Sales",
    showlegend=False,
    template="plotly",
    height=600,
    width=1000,
    margin=dict(r=150),
    paper_bgcolor="#FFE8D6",
    plot_bgcolor="#FFE8D6",
)

# Show the plot
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
# Increase linewidth and remove grid lines
import plotly.graph_objects as go

# (colour, line width) keyed by whether the phone is the Nokia 1100.
trace_styles = {True: ('blue', 4), False: ('grey', 2)}

# Create the line plot
fig = go.Figure()

for phone in ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]:
    color, line_width = trace_styles[phone == "Nokia 1100"]
    fig.add_trace(
        go.Scatter(
            x=data["Year"],
            y=data[phone],
            mode='lines+markers+text',
            name=phone,
            line=dict(color=color, width=line_width),
            # Text only at the last point
            text=[None] * (len(data["Year"]) - 1) + [phone],
            textposition='top center',
            textfont_size=10.5,
        )
    )

# Layout: legend off, grid lines off on both axes.
fig.update_layout(
    title="Sales of Different Phones (2006-2015)",
    xaxis_title="Year",
    yaxis_title="Sales",
    showlegend=False,
    template="plotly",
    height=600,
    width=1000,
    margin=dict(r=150),
    paper_bgcolor="#FFE8D6",
    plot_bgcolor="#FFE8D6",
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
)

# Show the plot
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
import plotly.graph_objects as go

# Create the line plot
fig = go.Figure()

# Add a trace for each phone
for phone in ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]:
    is_nokia = phone == "Nokia 1100"
    color = 'blue' if is_nokia else 'grey'
    line_width = 3 if is_nokia else 2        # Thicker line for Nokia 1100
    text_color = 'blue' if is_nokia else 'grey'

    fig.add_trace(go.Scatter(
        x=data["Year"],
        y=data[phone],
        mode='lines+markers+text',
        name=phone,
        line=dict(color=color, width=line_width),
        text=[None] * (len(data["Year"]) - 1) + [phone],  # Label only the last point
        textposition='bottom center',
        textfont=dict(size=10.5, color=text_color),
    ))

# Both axes share the same look: no grid lines, size-16 tick labels in #3d3846.
axis_look = dict(showgrid=False, tickfont=dict(size=16, color="#3d3846"))

# Update layout
fig.update_layout(
    title="<b>The amazing sales of Nokia 1100<br>(2006 - 2015)</b>",
    title_font=dict(size=25),
    title_x=0.5,
    # xaxis_title="Year",
    yaxis_title="Sales",
    xaxis=dict(axis_look),
    yaxis=dict(axis_look),
    showlegend=False,
    template="plotly",
    height=600,
    width=1000,
    margin=dict(r=150),  # Right margin so the end labels are not cut off
    paper_bgcolor="#FFE8D6",
    plot_bgcolor="#FFE8D6",
)

# Add an image (logo.png), anchored by its bottom-right corner in paper
# coordinates: at the right edge (x=1) and slightly below the plot area (y=-0.13).
fig.add_layout_image({
    'source': "logo.png",
    'xref': "paper",
    'yref': "paper",
    'x': 1,
    'y': -0.13,
    'xanchor': "right",
    'yanchor': "bottom",
    'sizex': 0.2,
    'sizey': 0.2,
    'opacity': 1,
    'layer': "above",
})

# Show the plot
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
import plotly.graph_objects as go

# Create the line plot
fig = go.Figure()

# Add a trace for each phone
for phone in ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]:
    # Nokia 1100 gets a blue, thicker line and blue label; the rest are grey.
    if phone == "Nokia 1100":
        color, line_width, text_color = 'blue', 4, 'blue'
    else:
        color, line_width, text_color = 'grey', 2, 'grey'

    labels = [None] * (len(data["Year"]) - 1) + [phone]  # name on the last point only
    fig.add_trace(go.Scatter(
        x=data["Year"],
        y=data[phone],
        mode='lines+markers+text',
        name=phone,
        line=dict(color=color, width=line_width),
        text=labels,
        textposition='bottom center',
        textfont=dict(size=10.5, color=text_color),
    ))

# Update layout
fig.update_layout(
    title="<b>The amazing sales of Nokia 1100<br>(2006 - 2015)</b>",
    xaxis_title="Year",
    yaxis_title="Sales",
    showlegend=False,
    template="plotly",
    height=600,
    width=1000,
    margin=dict(r=150),
    paper_bgcolor="#FFE8D6",
    plot_bgcolor="#FFE8D6",
    xaxis=dict(showgrid=False),  # No grid lines
    yaxis=dict(showgrid=False),  # No grid lines
)

# Show the plot
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
# Display `used`; writing it to 'old_phones.parquet' is currently disabled.
# used.write_parquet('old_phones.parquet')
used

# Display `data`.
data
shape: (10, 6)
Year
Nokia 1100
Samsung E250
LG Chocolate
Motorola Razr
BlackBerry Pearl
i64
i64
i64
i64
i64
i64
2006
20882
11556
17774
10976
19516
2007
18135
13325
16917
16949
11911
2008
32077
26739
12999
29979
27518
2009
28274
12895
24870
25888
26425
2010
18156
16969
12296
11419
11066
2011
23833
22274
16032
15188
16077
2012
15002
11729
10197
13179
14021
2013
26106
15557
22936
17432
24399
2014
30401
28413
26105
12006
17926
2015
25791
12787
10366
24104
19683
# Bind `used` to the same object as `data` (plain assignment, no copy), then display it.
used = data
used
shape: (10, 6)
Year
Nokia 1100
Samsung E250
LG Chocolate
Motorola Razr
BlackBerry Pearl
i64
i64
i64
i64
i64
i64
2006
28933
27556
23945
12230
24750
2007
30547
19208
17206
29093
26823
2008
30926
14239
20094
24151
29454
2009
24224
23071
19134
15833
21404
2010
27221
25925
15484
22814
25203
2011
20302
12984
10218
19336
18414
2012
27979
14310
21136
17433
26647
2013
24574
16288
23404
19205
21133
2014
30382
10628
28936
25888
26215
2015
28839
14157
24244
11939
27466
import polars as pl
import polars.selectors as cs
import requests

# Define the sheet ID and build the xlsx export URL
sheet_id = '1UnLz40Our1Ids-O0sz26uPNCF6cQjwosrZQY4VLdflU'
url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx'

# Download the Excel file from Google Sheets.
# Fail loudly on an HTTP error instead of saving an error page as .xlsx, and
# bound the wait so a stalled connection cannot hang the notebook.
response = requests.get(url, timeout=60)
response.raise_for_status()
with open('google_sheet.xlsx', 'wb') as file:
    file.write(response.content)

# Read the specific tab from the downloaded Excel file
sheet_name = 'Fall 2015 Data'
cols = [
    'Timestamp', 'Company', 'Position', 'Job_Type', 'Location', 'Base_Salary',
    'Pay_Period', 'Housing_Stipend', 'Signing_Bonus', 'Stock_Benefits',
    'Vesting_Details', 'Added_Benefits', 'Negotiate', 'Accept_Offer',
    'Weeks_Interview_Offer', 'Sex', 'Degree_Level', 'College_Year',
    'Last_School_Attended', 'Major', 'Added_Info', 'Monthly_Salary',
    'Annualized_Salary', 'Yearly_Bonus', 'Annualized_Recurring_Renumeration',
    'Monthly_Recurring_Renumeration', 'First_Year_Annualized_Renumeration',
    'Empty',
]
data = (
    pl.read_excel(
        'google_sheet.xlsx',
        engine='xlsx2csv',
        sheet_name=sheet_name,
        read_options={'new_columns': cols, 'ignore_errors': True},
    )
    .select(pl.exclude('Timestamp', 'Empty'))
)
data

# Light clean-up: normalise one company name, blank out 'n/a' vesting details,
# and replace nulls in every string column with '--'.
df = (
    data
    .with_columns(
        # literal=True: without it the pattern is a regex and '.' matches any character
        pl.col('Company').str.replace('Amazon.com', 'Amazon', literal=True),
        pl.col('Vesting_Details').str.strip_chars().str.replace('n/a', ''),
    )
    .with_columns(cs.by_dtype(pl.Utf8).fill_null('--'))
)
df
import polars as pl

# Load the compensation table from Parquet and display it.
df = pl.read_parquet('salary_compensation.parquet')
df
# Highest base salary by job type by pay period.
(
    df.group_by('Job_Type', 'Pay_Period')
    .agg(pl.col('Base_Salary').max())
    .sort('Job_Type')
)
# Average income by sex by job type.
# The mean is taken separately for each of the three pay columns.
(
    df.group_by('Sex', 'Job_Type')
    .agg(pl.col('Annualized_Salary', 'Yearly_Bonus', 'Signing_Bonus').mean())
)
# Average income by location by job type.
(
    df.group_by('Location', 'Job_Type')
    .agg(pl.col('Annualized_Salary', 'Annualized_Recurring_Renumeration').mean())
)
# Chance of being picked for each college year.
# "Middler (3/5 years)" is relabelled as 'Freshman' before counting; each
# group's row count is then divided by the total number of rows.
(
    df.with_columns(pl.col('College_Year').replace({"Middler (3/5 years)": 'Freshman'}))
    .group_by('College_Year')
    .agg(pl.len())
    .with_columns(Probability=pl.col('len') / pl.col('len').sum())
    .sort('Probability', descending=True)
)
import pandas as pd

# Read the CSV over HTTP; '?raw=True' requests the raw file rather than the GitHub page.
cereal_url = 'https://github.com/chris1610/pbpython/blob/master/data/cereal_data.csv?raw=True'
df = pd.read_csv(cereal_url)
df
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm

# Figure and axes share the same background colour.
fig, ax = plt.subplots(figsize=[6, 4])
fig.patch.set_facecolor('#d6deeb')
ax.set_facecolor('#d6deeb')

# Standard normal density on [-3, 3]
x = np.linspace(-3, 3, 1000000)
y = norm.pdf(x, loc=0, scale=1)

# Modulate the density with a sine wave to get the zigzag outline
ripple = 0.7 * np.sin(30 * np.pi * x) * 1.4
y_zigzag = y * (1 + ripple)

# Grey line to contrast with the background
ax.plot(x, y_zigzag, color='#696969')

# Hide the axes and let the plot fill the whole figure
ax.axis('off')
ax.set_position([0, 0, 1, 1])

# Saving is currently disabled:
# plt.savefig("bell_curve_zigzag.svg", bbox_inches='tight', pad_inches=0)
plt.show()
import matplotlib.pyplot as plt
import numpy as np

plt.figure(figsize=[6, 6])

# Sine wave whose amplitude grows linearly with x.
# NOTE(review): a 0.00001 step over [0, 100) is about 10 million samples, which
# is heavy for an SVG export -- consider a coarser step.
x = np.arange(0, 100, 0.00001)
y = x * np.sin(2 * np.pi * x)

# Only y is passed, so the horizontal axis is the sample index, not x.
plt.plot(y)

# Hide the axes and stretch the drawing over the full canvas before saving.
plt.axis('off')
current_axes = plt.gca()
current_axes.set_position([0, 0, 1, 1])
plt.savefig("test.svg")
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm

mean = 0     # Mean of the distribution
std_dev = 1  # Standard deviation of the distribution

# Density evaluated within two standard deviations either side of the mean
x = np.linspace(mean - 2*std_dev, mean + 2*std_dev, 10000)
y = norm.pdf(x, mean, std_dev)

fig, ax = plt.subplots(facecolor='#d6deed')
ax.set_facecolor('#d6deed')

# Thick blue curve
ax.plot(x, y, color='#1E90FF', lw=5)

# Fill under the curve inside the central 95% interval (same colour as the background)
ci_lower, ci_upper = norm.interval(0.95, loc=mean, scale=std_dev)
inside_interval = (x >= ci_lower) & (x <= ci_upper)
ax.fill_between(x, y, where=inside_interval, color='#d6deed')

# Density at the mean; computed here but not used by the axvline call below
y_mean = norm.pdf(mean, mean, std_dev)
# Vertical dash-dot marker at the mean; ymin/ymax are fractions of the axes height
ax.axvline(mean, color='black', ls='-.', lw=5, ymin=.945, ymax=0.1, solid_joinstyle='bevel')

# Keep only a thick bottom spine, pulled 30 points into the axes
for side in ('top', 'left', 'right'):
    ax.spines[side].set_visible(False)
bottom_spine = ax.spines['bottom']
bottom_spine.set_position(('outward', -30))
bottom_spine.set_linewidth(5)

# No y ticks or labels, and no x ticks
ax.tick_params(axis='y', which='major', left=False, labelleft=False)
ax.set_xticks([])

plt.show()
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm

plt.figure(figsize=[6, 6])

# Standard normal density on [-3, 3]
x = np.linspace(-3, 3, 10000)
y = norm.pdf(x, loc=0, scale=1)

# Small-amplitude (0.2), higher-frequency (50*pi) sine modulation of the density
wobble = 0.2 * np.sin(50 * np.pi * x)
y_zigzag = y * (1 + wobble)

plt.plot(x, y_zigzag, color='#1E90FF')

# Hide the axes, fill the canvas and save
plt.axis('off')
current_axes = plt.gca()
current_axes.set_position([0, 0, 1, 1])
plt.savefig("bell_curve_zigzag_tight.svg")
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm

fig = plt.figure(figsize=[6, 4])

# Create a bell curve (Gaussian distribution)
x = np.linspace(-3, 3, 1000000)
y = norm.pdf(x, 0, 1)

# Apply a sinusoidal wave to the bell curve
y_zigzag = y * (1 + 0.7 * np.sin(30 * np.pi * x) * 1.4)

# Plot the zigzagged bell curve
plt.plot(x, y_zigzag, color='#00BFFF')

# pyplot has no background_color(); the background is set on the figure patch
# and on the axes instead.
# NOTE(review): the background colour is the same as the line colour, so the
# curve will not be visible -- pick a contrasting colour for one of them.
fig.patch.set_facecolor('#00BFFF')
plt.gca().set_facecolor('#00BFFF')

# Remove axes and save the image
plt.axis('off')
plt.gca().set_position([0, 0, 1, 1])
plt.savefig("bell_curve_zigzag.svg")
import matplotlib.pyplot as plt
import numpy as np

plt.figure(figsize=[6, 6])

# x * sin(2*pi*x): an oscillation whose envelope widens as x increases.
# NOTE(review): about 10 million samples (step 0.00001) go into the SVG --
# consider a coarser step.
x = np.arange(0, 100, 0.00001)
y = x * np.sin(2 * np.pi * x)

# Plotted against the sample index because only y is supplied.
plt.plot(y)

# No axes; drawing area covers the entire figure.
plt.axis('off')
canvas_axes = plt.gca()
canvas_axes.set_position([0, 0, 1, 1])
plt.savefig("test.svg")
from pathlib import Path

import polars as pl
import polars.selectors as cs

# Absolute path to the earnings dataset on the author's machine.
earnings_file = r"C:/Users/Jmutenge/OneDrive - searsseating.com/Desktop/ne/blog/datasets/gender_earnings.parquet"

df = pl.read_parquet(earnings_file)
df
from great_tables import GT, html


def _grey_bold(text):
    """Return *text* as a bold, grey HTML column label."""
    return html(f'<b style="color: grey;">{text}</b>')


# Every column is labelled simply 'Men' or 'Women'; the spanners above them
# carry the group name.
column_labels = {
    'All_Males': _grey_bold('Men'),
    'All_Females': _grey_bold('Women'),
    'Male_Busdrivers': _grey_bold('Men'),
    'Female_Busdriver': _grey_bold('Women'),
    'Male_Cashier': _grey_bold('Men'),
    'Female_Cashier': _grey_bold('Women'),
}

(
    GT(df, rowname_col='Year')
    .tab_header(title=html("<h4>Average earnings for men and women,<br>overall and by occupation</h4>"))
    .cols_label(**column_labels)
    .tab_spanner(label=html("<b>All</b>"), columns=['All_Males', 'All_Females'])
    .tab_spanner(label=html("<b>Busdrivers</b>"), columns=['Male_Busdrivers', 'Female_Busdriver'])
    .tab_spanner(label=html("<b>Cashiers</b>"), columns=['Male_Cashier', 'Female_Cashier'])
)
# Map every column of df to the same fixed width.
set_width = '100px'
width_dict = dict.fromkeys(df.columns, set_width)
from great_tables import GT, md, html

# Uniform width for every column of df
set_width = '100px'
width_dict = dict.fromkeys(df.columns, set_width)

# Header, footnote and column-label markup
table_title = html("<h4>Average earnings for men and women,<br>overall and by occupation</h4>")
table_note = md("**Note**: Data is simulated. The units is guavas.")
men_label = '<b style="color: grey;">Men</b>'
women_label = '<b style="color: grey;">Women</b>'

(
    GT(df, rowname_col='Year')
    .tab_header(title=table_title)
    .tab_source_note(source_note=table_note)
    .cols_label(
        All_Males=html(men_label),
        All_Females=html(women_label),
        Male_Busdrivers=html(men_label),
        Female_Busdriver=html(women_label),
        Male_Cashier=html(men_label),
        Female_Cashier=html(women_label),
    )
    # One spanner per group of Men/Women columns
    .tab_spanner(label=html("<b>All</b>"), columns=['All_Males', 'All_Females'])
    .tab_spanner(label=html("<b>Busdrivers</b>"), columns=['Male_Busdrivers', 'Female_Busdriver'])
    .tab_spanner(label=html("<b>Cashiers</b>"), columns=['Male_Cashier', 'Female_Cashier'])
    # Float columns: one decimal place, no thousands separators
    .fmt_number(columns=cs.float(), decimals=1, use_seps=False)
    .cols_width(cases=width_dict)
)
import plotly.graph_objects as go

# Create the bar chart
fig = go.Figure()

# Bars: units sold per software title. Cum_Pct rides along as customdata so the
# bar hover text can show it.
fig.add_trace(go.Bar(
    x=df['Software'],
    y=df['Units_Sold'],
    name='Units sold',
    marker_color='#008B8B',
    customdata=df['Cum_Pct'],
))

# Pareto line: cumulative percentage, drawn against the secondary y-axis
fig.add_trace(go.Scatter(
    x=df['Software'],
    y=df['Cum_Pct'],
    name='Cumulative Percentage',
    mode='lines+markers',
    marker_color='#ffffff',
    yaxis='y2',
))

# Hover text for the bars: units sold plus cumulative percentage
hover_string_bar = '<b>Units Sold: </b> %{y:,} <br>'
hover_string_bar += '<b>Cum percent: </b> %{customdata:.2%}'
fig.update_traces(hovertemplate=hover_string_bar, selector=dict(type='bar'))

# Hover text for the line: software name plus cumulative percentage
hover_string_scatter = '<b>Software: </b> %{x}<br>'
hover_string_scatter += '<b>Cum percent: </b> %{y:.2%}'
fig.update_traces(hovertemplate=hover_string_scatter, selector=dict(type='scatter'))

# Layout with dual y-axes
fig.update_layout(
    title=dict(text='<b>Software units sold</b><br><b>Pareto chart</b>',
               font_size=22,
               pad={'t': 0.75}),
    yaxis=dict(
        title='<b>Units sold</b>',  # bold tag is now closed properly
        showgrid=False,
        tickformat=',',  # comma as thousands separator
    ),
    yaxis2=dict(
        title='<b>Cumulative percentage</b>',  # bold tag is now closed properly
        overlaying='y',
        color='#613583',
        side='right',
        tickformat='.0%',  # whole-number percentages
        showgrid=False,
    ),
    xaxis=dict(showgrid=False),
    plot_bgcolor='#FFE4B5',
    paper_bgcolor='#FFE4B5',
    bargap=0.1,
    legend=dict(x=.25, y=1.07, orientation='h'),  # horizontal legend above the plot
    font=dict(family='Inter'),
)

fig.show()
import plotly.express as px
import polars as pl


def _underscored(col):
    """Return the column name with spaces replaced by underscores."""
    return col.replace(' ', '_')


# Load the regional sales file and normalise its column names.
data = pl.read_parquet('draft/datasets/regional_sales.parquet').rename(_underscored)
data
import polars as pl

# One-off conversion of a CSV file to an Excel workbook.
csv_source = r"C:/Users/Jmutenge/OneDrive - searsseating.com/Desktop/ne/javascript/output.csv"
excel_target = r"C:/Users/Jmutenge/OneDrive - searsseating.com/Desktop/CN.xlsx"

pl.read_csv(csv_source).write_excel(excel_target)
# Rows whose 'Date' string contains 'October'.
in_october = pl.col('Date').str.contains('October')
df.filter(in_october)

# Scratch arithmetic
92 * 4
import random

min_num = 1
max_num = 100
answer = random.randint(min_num, max_num)
attempts = 0
running = True

# Keep prompting until the player hits the answer. Only in-range numeric
# guesses count as attempts.
while running:
    try:
        guess = int(input(f"Guess a number between {min_num} - {max_num}: "))
    except ValueError:
        # Non-numeric input: ask again without counting it
        print("Please enter a valid number.")
        continue

    if not (min_num <= guess <= max_num):
        print(f"Please enter a valid number between {min_num} and {max_num}.")
        continue

    attempts += 1
    if guess == answer:
        print(f"CORRECT! The answer was {answer}. It took you {attempts} attempts.")
        running = False
    elif guess < answer:
        print("TOO LOW! TRY AGAIN!")
    else:
        print("TOO HIGH! TRY AGAIN!")
# From Matt Harrison - what I learned from his vids
import polars as pl

# Expression that replaces 0 with null in column 'a'
pl.col('a').replace(0, None)

# Convert to pandas using pyarrow extension arrays
df.to_pandas(use_pyarrow_extension_array=True)

# Scatter plot with a '%.0f' y-axis formatter and a jitter option
(
    df.plot
    .scatter(yformatter='%.0f')
    .opts(jitter=.8)
)
# Rename two columns, turn the currency strings in 'Sales In Dollars' into
# whole numbers, and save the result.
(
    df
    .rename({'Region': 'Region Name', 'Sales': 'Sales In Dollars'})
    .with_columns(
        pl.col('Sales In Dollars')
        # replace_all strips every '$' and ',' -- str.replace only removes the
        # first match, which breaks on values with several thousands separators
        .str.replace_all(r'[$,]', '')
        .str.strip_chars()
        # Go through Float64: Float32 cannot represent every integer above 2**24
        .cast(pl.Float64)
        .cast(pl.Int32)
    )
    .write_parquet('regional_sales.parquet')
)
import polars as pl

# Load the cleaned file and look at five random rows.
df = pl.read_parquet('regional_sales.parquet')
df.sample(5)

# Just the 'Segment' column
df.select(pl.col('Segment'))

# Same frame with the spaces in four column names swapped for underscores.
spaced_columns = ['Customer ID', 'Customer Name', 'Region Name', 'Sales In Dollars']
df.rename({name: name.replace(' ', '_') for name in spaced_columns})
# Extract day and month names (short and long)
# New column name -> strftime format applied to 'Datetime'
name_formats = {
    'Day_Short': '%a',
    'Day_Long': '%A',
    'Month_Short': '%b',
    'Month_Long': '%B',
}
(
    df
    # Keep 'text' and convert the epoch column 'time_parsed' into 'Datetime'
    .select('text', Datetime=pl.from_epoch('time_parsed'))
    .with_columns(**{
        column: pl.col('Datetime').dt.strftime(fmt)
        for column, fmt in name_formats.items()
    })
)
# Why need short names
# Bar chart of row counts per long weekday name.
stamped = df.select('text', Datetime=pl.from_epoch('time_parsed'))
named = stamped.with_columns(
    Day_Short=pl.col('Datetime').dt.strftime('%a'),
    Day_Long=pl.col('Datetime').dt.strftime('%A'),
    Month_Short=pl.col('Datetime').dt.strftime('%b'),
    Month_Long=pl.col('Datetime').dt.strftime('%B'),
)
per_day = named.group_by('Day_Long').len()
per_day.to_pandas().plot.bar(x='Day_Long', y='len', rot=0, width=.85)
# Bar chart of the number of comments per weekday, labelled with short day names.
(
    df
    .select('text', 'time_parsed')
    .with_columns(Datetime=pl.from_epoch('time_parsed'))
    .drop('time_parsed')
    .with_columns(
        # ISO weekday number (Monday=1 ... Sunday=7), used only for ordering
        Day_Num=pl.col('Datetime').dt.weekday(),
        Day_Short=pl.col('Datetime').dt.strftime('%a'),
    )
    .group_by('Day_Num', 'Day_Short').len()
    # group_by does not guarantee group order, so sort to get Mon..Sun bars
    # in the same order on every run
    .sort('Day_Num')
    .to_pandas()
    .plot.bar(x='Day_Short', y='len', rot=0, width=.85, legend=False,
              color='#dc8add', xlabel='',
              title='Total number of comments for each week day',
              figsize=(8, 4))
);
# If you can't wrap your head around how to extract day or week names, use this library.
import polars_xdt as xdt

(
    df
    # 'text' plus the epoch column converted to 'Datetime'
    .select('text', Datetime=pl.from_epoch('time_parsed'))
    .with_columns(
        Weekday=xdt.day_name('Datetime'),
        Month=xdt.month_name('Datetime'),
    )
)
# Even cooler, other languages for day names
# New column name -> locale passed to xdt.day_name
localized = {'French_Weekday': 'fr_FR', 'Ukranian_Weekday': 'uk_UA'}
(
    df
    .select('text', Datetime=pl.from_epoch('time_parsed'))
    .with_columns(
        Weekday=xdt.day_name('Datetime'),
        **{column: xdt.day_name('Datetime', locale=locale)
           for column, locale in localized.items()},
    )
)
# If we wanted to make that plot with short weekday names
(
    df
    .select('text', Datetime=pl.from_epoch('time_parsed'))
    .with_columns(
        Weekday=xdt.day_name('Datetime'),
        # First three characters of the weekday name
        Weekday_Short=xdt.day_name('Datetime').str.slice(0, 3),
    )
)
import re
from glob import glob
from pathlib import Path

import polars as pl

# Get a list of all text files in the folder (sorted so the row order of the
# combined frame is the same on every run)
txt_files = sorted(glob(r"C:/Users/Jmutenge/Downloads/names/*.txt"))

dfs = []
for file in txt_files:
    # Extract the year from the file name only, so digits elsewhere in the
    # path can never be mistaken for the year
    year_match = re.search(r'\d{4}', Path(file).name)
    if year_match is None:
        raise ValueError(f"No four-digit year in file name: {file}")
    year = year_match.group()

    # Read the file and add the 'Year' column (kept as a string).
    # has_header=False: assumes the files hold data from the first line, with
    # no header row -- TODO confirm. Without it the first record of every
    # file is consumed as column names and lost.
    df = pl.read_csv(file, has_header=False, new_columns=['Name', 'Sex', 'Count'])
    df = df.with_columns(pl.lit(year).alias('Year'))
    dfs.append(df)

# Concatenate all DataFrames
data = pl.concat(dfs)
data
# Fix the ordering of bars
# Sorting by seats first makes the horizontal bars appear in sorted order.
by_seats = df.sort('Seats_Won')
by_seats.plot.barh(x='Party')
# Convert dataframe to pandas
# The rows are sorted by 'Seats_Won' before the conversion.
df_pandas = (
    df
    .sort('Seats_Won')
    .to_pandas()
)

df_pandas
from matplotlib import pyplot as plt

plt.rc('font', size=12)

# One colour per bar, in the row order of df_pandas
party_colors = ['#808080', "#10C25B", "#006644", "#0047AB", "#DE3533"]
axis_label_font = {'size': 18}

fig, ax = plt.subplots(figsize=(12, 6), facecolor='#F8F8FF', dpi=500)
ax.set_facecolor('#F8F8FF')

# Hide the left, top and right spines; the bottom one stays
for side in ('left', 'top', 'right'):
    ax.spines[side].set_visible(False)

# Horizontal bars: party on the y-axis, seats won as the bar length
ax.barh('Party', 'Seats_Won', data=df_pandas, color=party_colors)
ax.set_ylabel('Party', fontdict=axis_label_font)
ax.set_xlabel('Number of seats won', fontdict=axis_label_font, labelpad=5)
ax.xaxis.grid(linestyle='--')
fig.suptitle('Victorian election 2018 lower house results', fontsize=24, weight=800, y=.93)

# Dashed vertical line at 44 seats, spanning the lower 80% of the axes,
# with its label just to the right
ax.axvline(x=44, ymin=0, ymax=0.8, color='k', linestyle='--', linewidth=2)
ax.text(44 + 1, 3, 'majority of\nparliament', ha='left', va='center', fontsize=14, color='k')

# Source credit below the axes
fig.text(0.76, -0.028, 'Data source: Victorian Electoral Commission', ha='right', fontsize=14, va='bottom')

plt.show();
# Colors for each party, in the row order of df_pandas
colors = ['#808080', '#10C25B', '#006644', '#0047AB', '#DE3533']

plt.rc('font', size=12)
fig, ax = plt.subplots(figsize=(12, 6), facecolor='#F8F8FF', dpi=500)
ax.spines[['left', 'top', 'right']].set_visible(False)  # keep only the bottom spine
ax.set_facecolor('#F8F8FF')

# Plotting the lollipop chart: one stem and one dot per party.
# Iterating the values with zip avoids label-based lookups such as y[i],
# which only work while the DataFrame index happens to be 0..n-1.
y = df_pandas['Party']
x = df_pandas['Seats_Won']
for party, seats, color in zip(y, x, colors):
    ax.hlines(party, 0, seats, color=color, linestyle='-', linewidth=4)  # stem
    ax.plot(seats, party, 'o', color=color, ms=20)  # circle at the end

ax.set_ylabel('Party', fontdict={'size': 18})
ax.set_xlabel('Number of seats won', fontdict={'size': 18}, labelpad=20)
ax.xaxis.grid(linestyle='--')
fig.suptitle('Victorian election 2018 lower house results', fontsize=24, weight=800, y=0.93)

# Remove y-axis tick marks and pull the tick labels closer to the axis
ax.tick_params(axis='y', which='both', length=0)
ax.yaxis.set_tick_params(pad=-10)

fig.text(0.76, -0.07, 'Data source: Victorian Electoral Commission', ha='right', fontsize=14, va='bottom')
plt.show()