## DETECTING OUTLIERS

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(42)

data = pd.DataFrame({
    'value': np.random.normal(0, 1, 1000)
})
data
value
0 0.496714
1 -0.138264
2 0.647689
3 1.523030
4 -0.234153
... ...
995 -0.281100
996 1.797687
997 0.640843
998 -0.571179
999 0.572583

1000 rows × 1 columns

# Detect outliers using IQR
def detect_outliers_iqr(data, factor=1.5):
    """Flag outliers in *data* using the interquartile-range (IQR) rule.

    Parameters
    ----------
    data : pd.Series
        Numeric values to screen.
    factor : float, default 1.5
        Multiplier applied to the IQR. 1.5 is Tukey's conventional fence
        (the original hard-coded value); 3.0 is commonly used to flag only
        "extreme" outliers.

    Returns
    -------
    pd.Series of bool
        True where the value falls outside
        [Q1 - factor*IQR, Q3 + factor*IQR].
    """
    Q1 = data.quantile(.25)
    Q3 = data.quantile(.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    return (data < lower_bound) | (data > upper_bound)

outliers = detect_outliers_iqr(data['value'])
print(f"Number of outliers detected: {sum(outliers)}")
Number of outliers detected: 8
# Visualize the data with outliers using scatter plot and box plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Scatter plot
ax1.scatter(range(len(data)), data['value'], c=['blue' if not x else 'red' for x in outliers])
ax1.set_title('Dataset with Outliers Highlighted (Scatter Plot)')
ax1.set_xlabel('Index')
ax1.set_ylabel('Value')

# Box plot
sns.boxplot(x=data['value'], ax=ax2)
ax2.set_title('Dataset with Outliers (Box Plot)')
ax2.set_xlabel('Value')

plt.tight_layout()
plt.show()

# Remove outliers
data_cleaned = data[~outliers]

print(f"Original dataset size: {len(data)}")
print(f"Cleaned dataset size: {len(data_cleaned)}")

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Scatter plot
ax1.scatter(range(len(data_cleaned)), data_cleaned['value'])
ax1.set_title('Dataset After Removing Outliers (Scatter Plot)')
ax1.set_xlabel('Index')
ax1.set_ylabel('Value')

# Box plot
sns.boxplot(x=data_cleaned['value'], ax=ax2)
ax2.set_title('Dataset After Removing Outliers (Box Plot)')
ax2.set_xlabel('Value')

plt.tight_layout()
plt.show()
Original dataset size: 1000
Cleaned dataset size: 992

# Capping

def cap_outliers(data, lower_percentile=5, upper_percentile=95):
    """Winsorize *data*: clip values to the given lower/upper percentiles.

    Values below the `lower_percentile` limit are raised to it, and values
    above the `upper_percentile` limit are lowered to it. Returns a new
    array-like; the input is not modified.
    """
    lo, hi = np.percentile(data, [lower_percentile, upper_percentile])
    return np.clip(data, lo, hi)

data['value_capped'] = cap_outliers(data['value'])

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Scatter plot
ax1.scatter(range(len(data)), data['value_capped'])
ax1.set_title('Dataset After Capping Outliers (Scatter Plot)')
ax1.set_xlabel('Index')
ax1.set_ylabel('Value')

# Box plot
sns.boxplot(x=data['value_capped'], ax=ax2)
ax2.set_title('Dataset After Capping Outliers (Box Plot)')
ax2.set_xlabel('Value')

plt.tight_layout()
plt.show()

# Imputing Outliers

data['value_imputed'] = data['value'].copy()
median_value = data['value'].median()
data.loc[outliers, 'value_imputed'] = median_value

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Scatter plot
ax1.scatter(range(len(data)), data['value_imputed'])
ax1.set_title('Dataset After Imputing Outliers (Scatter Plot)')
ax1.set_xlabel('Index')
ax1.set_ylabel('Value')

# Box plot
sns.boxplot(x=data['value_imputed'], ax=ax2)
ax2.set_title('Dataset After Imputing Outliers (Box Plot)')
ax2.set_xlabel('Value')

plt.tight_layout()
plt.show()

# Transformations - Right-skewed data

# Generate right-skewed data
np.random.seed(42)
data = np.random.exponential(scale=2, size=1000)
df = pd.DataFrame(data, columns=['value'])

# Apply Log Transformation (shifted to avoid log(0))
df['log_value'] = np.log1p(df['value'])
df
value log_value
0 0.938536 0.661933
1 6.020243 1.948798
2 2.633491 1.290194
3 1.825885 1.038822
4 0.339250 0.292110
... ... ...
995 0.192101 0.175718
996 4.985400 1.789323
997 0.294261 0.257940
998 6.000981 1.946050
999 1.181202 0.779876

1000 rows × 2 columns

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Original Data - Scatter Plot
axes[0, 0].scatter(range(len(df)), df['value'], alpha=0.5)
axes[0, 0].set_title('Original Data (Scatter Plot)')
axes[0, 0].set_xlabel('Index')
axes[0, 0].set_ylabel('Value')

# Original Data - Box Plot
sns.boxplot(x=df['value'], ax=axes[0, 1])
axes[0, 1].set_title('Original Data (Box Plot)')
axes[0, 1].set_xlabel('Value')

# Log Transformed Data - Scatter Plot
axes[1, 0].scatter(range(len(df)), df['log_value'], alpha=0.5)
axes[1, 0].set_title('Log Transformed Data (Scatter Plot)')
axes[1, 0].set_xlabel('Index')
axes[1, 0].set_ylabel('Log(Value)')

# Log Transformed Data - Box Plot
sns.boxplot(x=df['log_value'], ax=axes[1, 1])
axes[1, 1].set_title('Log Transformed Data (Box Plot)')
axes[1, 1].set_xlabel('Log(Value)')

plt.tight_layout()
plt.show()

# https://github.com/amandaiglesiasmoreno/visualizations/blob/main/How%20to%20design%20more%20informative%20visualizations.ipynb

#Data to ink ratio https://www.youtube.com/watch?v=JIMUzJzqaA8
# import polars as pl
# import numpy as np

# # Define the years and phones
# years = list(range(2006, 2015 + 1))
# phones = ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]

# # Generate sales data with Nokia 1100 having slightly higher sales
# sales_data = []

# for year in years:
#     # Random sales values for other phones
#     sales_values = np.random.randint(10000, 30000, size=len(phones) - 1)
    
#     # Slightly higher sales for Nokia 1100
#     max_other_sales = max(sales_values)
#     nokia_sales = int(max_other_sales * 1.05)  # 5% higher than the max of the others
    
#     # Combine Nokia 1100 sales with others
#     yearly_sales = [nokia_sales] + list(sales_values)
    
#     for phone, sales in zip(phones, yearly_sales):
#         sales_data.append((year, phone, sales))

# # Create the DataFrame
# df = pl.DataFrame({
#     "Year": [data[0] for data in sales_data],
#     "Phone": [data[1] for data in sales_data],
#     "Sales": [data[2] for data in sales_data],
# })

data = (pl.read_parquet('handsets.parquet')
 .pivot(on='Phone',
        values='Sales',
        index='Year'
        )
#  .plot.line(x='Year')
 )


import plotly.graph_objects as go

# Create the line plot
fig = go.Figure()

# Add a trace for each phone
for phone in ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]:
    color = 'blue' if phone == "Nokia 1100" else 'grey'
    fig.add_trace(go.Scatter(
        x=data["Year"],
        y=data[phone],
        mode='lines+markers+text',
        name=phone,
        line=dict(color=color),
        text=[None] * (len(data["Year"]) - 1) + [phone],  # Show text only at the last point
        textposition='top center',
        textfont_size=10.5
    ))

# Update layout
fig.update_layout(
    title="Sales of Different Phones (2006-2015)",
    xaxis_title="Year",
    yaxis_title="Sales",
    showlegend=False,  # Remove the legend
    template="plotly",
    height=600,
    width=1000,  # Increase the width of the figure
    margin=dict(r=150),  # Increase the right margin to avoid text being cut off
    paper_bgcolor="LightSteelBlue",
    plot_bgcolor="LightSteelBlue",
)

# Show the plot
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
(df
 .write_parquet('handsets.parquet')
 )
df
import polars as pl
df = (pl.read_parquet('handsets.parquet')
      .with_columns(pl.col('Sales').mul(1_000))
      )
df
shape: (50, 3)
Year Phone Sales
i64 str i64
2006 "Nokia 1100" 22992000
2006 "Samsung E250" 18921000
2006 "LG Chocolate" 11479000
2006 "Motorola Razr" 16314000
2006 "BlackBerry Pearl" 21898000
2015 "Nokia 1100" 27159000
2015 "Samsung E250" 25866000
2015 "LG Chocolate" 24107000
2015 "Motorola Razr" 22381000
2015 "BlackBerry Pearl" 16867000
data = (df
 .pivot(on='Phone',
        values='Sales',
        index='Year'
        )
#  .plot.line(x='Year')
 )
import plotly.graph_objects as go

# Create the line plot
fig = go.Figure()

# Add a trace for each phone
for phone in ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]:
    fig.add_trace(go.Scatter(
        x=data["Year"],
        y=data[phone],
        mode='lines+markers',
        name=phone
    ))

# Update layout
fig.update_layout(
    title="Sales of Different Phones (2006-2015)",
    xaxis_title="Year",
    yaxis_title="Sales",
    legend_title="Phone",
    template="plotly"
)

# Show the plot
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
import polars as pl
import numpy as np

# Define the years and phones
years = list(range(2006, 2015 + 1))
phones = ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]

# Generate sales data with Nokia 1100 having slightly higher sales.
# NOTE: no RNG seed is set in this cell, so the values differ on every run.
sales_data = []

for year in years:
    # Random sales values for other phones
    sales_values = np.random.randint(10000, 30000, size=len(phones) - 1)
    
    # Slightly higher sales for Nokia 1100
    max_other_sales = max(sales_values)
    nokia_sales = int(max_other_sales * 1.07)  # 7% higher than the max of the others
    
    # Combine Nokia 1100 sales with others (Nokia is first in `phones`)
    yearly_sales = [nokia_sales] + list(sales_values)
    
    for phone, sales in zip(phones, yearly_sales):
        sales_data.append((year, phone, sales))

# Create the DataFrame (long/tidy format: one row per year-phone pair)
df = pl.DataFrame({
    "Year": [data[0] for data in sales_data],
    "Phone": [data[1] for data in sales_data],
    "Sales": [data[2] for data in sales_data],
})

# Display the DataFrame
df

data = (df
 .pivot(on='Phone',
        values='Sales',
        index='Year'
        )
#  .plot.line(x='Year')
 )
import polars as pl
data = pl.read_parquet('old_phones.parquet')
import plotly.graph_objects as go

# Create the line plot
fig = go.Figure()

# Add a trace for each phone
for phone in ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]:
    color = 'blue' if phone == "Nokia 1100" else 'grey'
    fig.add_trace(go.Scatter(
        x=data["Year"],
        y=data[phone],
        mode='lines+markers+text',
        name=phone,
        line=dict(color=color),
        text=[None] * (len(data["Year"]) - 1) + [phone],  # Show text only at the last point
        textposition='top center',
        textfont_size=10.5
    ))

# Update layout
fig.update_layout(
    title="Sales of Different Phones (2006-2015)",
    xaxis_title="Year",
    yaxis_title="Sales",
    showlegend=False,  # Remove the legend
    template="plotly",
    height=600,
    width=1000,  # Increase the width of the figure
    margin=dict(r=150),  # Increase the right margin to avoid text being cut off
    paper_bgcolor="#FFE8D6",
    plot_bgcolor="#FFE8D6",
)

# Show the plot
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
# Increase linewidth and remove grid lines

import plotly.graph_objects as go

# Create the line plot
fig = go.Figure()

# Add a trace for each phone
for phone in ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]:
    color = 'blue' if phone == "Nokia 1100" else 'grey'
    line_width = 4 if phone == "Nokia 1100" else 2  # Thicker line for Nokia 1100
    fig.add_trace(go.Scatter(
        x=data["Year"],
        y=data[phone],
        mode='lines+markers+text',
        name=phone,
        line=dict(color=color, width=line_width),  # Set the line width here
        text=[None] * (len(data["Year"]) - 1) + [phone],  # Show text only at the last point
        textposition='top center',
        textfont_size=10.5
    ))

# Update layout
fig.update_layout(
    title="Sales of Different Phones (2006-2015)",
    xaxis_title="Year",
    yaxis_title="Sales",
    showlegend=False,  # Remove the legend
    template="plotly",
    height=600,
    width=1000,  # Increase the width of the figure
    margin=dict(r=150),  # Increase the right margin to avoid text being cut off
    paper_bgcolor="#FFE8D6",
    plot_bgcolor="#FFE8D6",
    xaxis={'showgrid':False},
    yaxis={'showgrid':False},
)

# Show the plot
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
import plotly.graph_objects as go

# Create the line plot
fig = go.Figure()

# Add a trace for each phone
for phone in ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]:
    color = 'blue' if phone == "Nokia 1100" else 'grey'
    line_width = 3 if phone == "Nokia 1100" else 2  # Thicker line for Nokia 1100
    text_color = 'blue' if phone == "Nokia 1100" else 'grey'  # Set text color for Nokia 1100
    fig.add_trace(go.Scatter(
        x=data["Year"],
        y=data[phone],
        mode='lines+markers+text',
        name=phone,
        line=dict(color=color, width=line_width),  # Set the line width here
        text=[None] * (len(data["Year"]) - 1) + [phone],  # Show text only at the last point
        textposition='bottom center',
        textfont=dict(size=10.5, color=text_color)  # Set text color here
    ))

# Update layout
fig.update_layout(
    title="<b>The amazing sales of Nokia 1100<br>(2006 - 2015)</b>",
    title_font=dict(size=25),  # Set title font size
    title_x=0.5,
    # xaxis_title="Year",
    yaxis_title="Sales",
    xaxis=dict(
        showgrid=False,  # Remove grid lines
        tickfont=dict(size=16, color="#3d3846"),  # Set x-axis label font size and color
    ),
    yaxis=dict(
        showgrid=False,  # Remove grid lines
        tickfont=dict(size=16, color="#3d3846"),  # Set y-axis label font size and color
    ),
    showlegend=False,  # Remove the legend
    template="plotly",
    height=600,
    width=1000,  # Increase the width of the figure
    margin=dict(r=150),  # Increase the right margin to avoid text being cut off
    paper_bgcolor="#FFE8D6",
    plot_bgcolor="#FFE8D6",
)

# Add an image (logo.png) at the bottom right of the plot
fig.add_layout_image(
    dict(
        source="logo.png",  # Path to the image
        xref="paper",  # Use paper coordinates (independent of data)
        yref="paper",
        x=1,  # Position at the right side (x=1 in paper coordinates)
        y=-0.13,  # Position at the bottom (y=0 in paper coordinates)
        xanchor="right",  # Anchor the image to the right
        yanchor="bottom",  # Anchor the image to the bottom
        sizex=0.2,  # Width of the image
        sizey=0.2,  # Height of the image
        opacity=1,  # Opacity of the image
        layer="above"  # Place image above the plot
    )
)

# Show the plot
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
import plotly.graph_objects as go

# Create the line plot
fig = go.Figure()

# Add a trace for each phone
for phone in ["Nokia 1100", "Samsung E250", "LG Chocolate", "Motorola Razr", "BlackBerry Pearl"]:
    color = 'blue' if phone == "Nokia 1100" else 'grey'
    line_width = 4 if phone == "Nokia 1100" else 2  # Thicker line for Nokia 1100
    text_color = 'blue' if phone == "Nokia 1100" else 'grey'  # Set text color for Nokia 1100
    fig.add_trace(go.Scatter(
        x=data["Year"],
        y=data[phone],
        mode='lines+markers+text',
        name=phone,
        line=dict(color=color, width=line_width),  # Set the line width here
        text=[None] * (len(data["Year"]) - 1) + [phone],  # Show text only at the last point
        textposition='bottom center',
        textfont=dict(size=10.5, color=text_color)  # Set text color here
    ))

# Update layout
fig.update_layout(
    title="<b>The amazing sales of Nokia 1100<br>(2006 - 2015)</b>",
    xaxis_title="Year",
    yaxis_title="Sales",
    showlegend=False,  # Remove the legend
    template="plotly",
    height=600,
    width=1000,  # Increase the width of the figure
    margin=dict(r=150),  # Increase the right margin to avoid text being cut off
    paper_bgcolor="#FFE8D6",
    plot_bgcolor="#FFE8D6",
    xaxis={'showgrid': False},  # Remove grid lines
    yaxis={'showgrid': False},  # Remove grid lines
)

# Show the plot
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
(used
#  .write_parquet('old_phones.parquet')
 )
(data)
shape: (10, 6)
Year Nokia 1100 Samsung E250 LG Chocolate Motorola Razr BlackBerry Pearl
i64 i64 i64 i64 i64 i64
2006 20882 11556 17774 10976 19516
2007 18135 13325 16917 16949 11911
2008 32077 26739 12999 29979 27518
2009 28274 12895 24870 25888 26425
2010 18156 16969 12296 11419 11066
2011 23833 22274 16032 15188 16077
2012 15002 11729 10197 13179 14021
2013 26106 15557 22936 17432 24399
2014 30401 28413 26105 12006 17926
2015 25791 12787 10366 24104 19683
used = data
used
shape: (10, 6)
Year Nokia 1100 Samsung E250 LG Chocolate Motorola Razr BlackBerry Pearl
i64 i64 i64 i64 i64 i64
2006 28933 27556 23945 12230 24750
2007 30547 19208 17206 29093 26823
2008 30926 14239 20094 24151 29454
2009 24224 23071 19134 15833 21404
2010 27221 25925 15484 22814 25203
2011 20302 12984 10218 19336 18414
2012 27979 14310 21136 17433 26647
2013 24574 16288 23404 19205 21133
2014 30382 10628 28936 25888 26215
2015 28839 14157 24244 11939 27466
import polars as pl
import polars.selectors as cs
import requests

# Define the sheet ID and the desired output Excel file name
sheet_id = '1UnLz40Our1Ids-O0sz26uPNCF6cQjwosrZQY4VLdflU'
url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx'

# Download the Excel file from Google Sheets
response = requests.get(url)
with open('google_sheet.xlsx', 'wb') as file:
    file.write(response.content)

# Read the specific tab from the downloaded Excel file
sheet_name = 'Fall 2015 Data'

cols = ['Timestamp','Company','Position','Job_Type','Location','Base_Salary','Pay_Period',
 'Housing_Stipend','Signing_Bonus','Stock_Benefits','Vesting_Details','Added_Benefits',
 'Negotiate','Accept_Offer','Weeks_Interview_Offer','Sex','Degree_Level','College_Year',
 'Last_School_Attended','Major','Added_Info','Monthly_Salary','Annualized_Salary','Yearly_Bonus',
 'Annualized_Recurring_Renumeration','Monthly_Recurring_Renumeration','First_Year_Annualized_Renumeration','Empty'
 ]

data = (pl.read_excel('google_sheet.xlsx',
                      engine='xlsx2csv',
                      sheet_name=sheet_name, read_options={'new_columns':cols, 'ignore_errors':True})
        .select(pl.exclude('Timestamp','Empty'))
        )
data

df = (data
 .with_columns(pl.col('Company').str.replace('Amazon.com','Amazon'),
               pl.col('Vesting_Details').str.strip_chars().str.replace('n/a',''))
 .with_columns(cs.by_dtype(pl.Utf8).fill_null('--'))
 )
df
import polars as pl
df = pl.read_parquet('salary_compensation.parquet')
df
# Highest base salary by job type by pay period.
(df
 .group_by('Job_Type','Pay_Period',)
 .agg(pl.max('Base_Salary'))
 .sort('Job_Type')
 )
# Average income by sex by job type.

(df
 .group_by('Sex','Job_Type')
 .agg(pl.mean('Annualized_Salary','Yearly_Bonus','Signing_Bonus'))
 )
# Average income by location by job type.

(df
 .group_by('Location','Job_Type')
 .agg(pl.mean('Annualized_Salary','Annualized_Recurring_Renumeration'))
 )
(df
 .with_columns(pl.col('College_Year').replace("Middler (3/5 years)", 'Freshman'))
 .group_by('College_Year').len()
 )
# Chance of being picked for each college year.
(df
 .with_columns(pl.col('College_Year').replace("Middler (3/5 years)", 'Freshman'))
 .group_by('College_Year').len()
 .with_columns(Probability=pl.col('len') / pl.col('len').sum())
 .sort('Probability', descending=True)
 )
(df
 .with_columns(pl.col('Major').str.replace_all(r'(?i)^CS',',Computer Science')
               .str.replace_all(r'(?i)^,CS',',Computer Science')
               .str.replace_all(r'/|\+|and',',')
               .str.replace_all('EECS','EE,CS')
               .str.replace_all(r'(?i)^EE', ',Electrical Engineering')
               .str.replace('MS CS','MS Computer Science')
               .str.replace(', CS',',Computer Science')
               .str.replace('Commuter','Computer')
               .str.replace(',CS',',Computer Science')
               .str.replace_all(r'^,','')
               .str.replace_all('Comp. Sci','Computer Science')
               .str.replace_all('at Community College','')
               .str.replace_all('compE','Computer Engineering')
               .str.strip_chars().str.to_titlecase()
               )
 .group_by('Major').first()
#  .filter(pl.col('Major').str.contains('Math'))
#  ['Major'].to_list()
 )
['Timestamp','Company','Position','Job_Type','Location','Base_Salary','Pay_Period',
 'Housing_Stipend','Signing_Bonus','Stock_Benefits','Vesting_Details','Added_Benefits',
 'Negotiate','Accept_Offer','Weeks_Interview_Offer','Sex','Degree_Level','College_Year',
 'Last_School_Attended','Major','Added_Info','Monthly_Salary','Annualized_Salary','Yearly_Bonus',
 'Annualized_Recurring_Renumeration','Monthly_Recurring_Renumeration','First_Year_Annualized_Renumeration','Empty'
 ]
# https://docs.google.com/spreadsheets/d/1UnLz40Our1Ids-O0sz26uPNCF6cQjwosrZQY4VLdflU/edit?gid=149178398#gid=149178398
sheet_id = '1UnLz40Our1Ids-O0sz26uPNCF6cQjwosrZQY4VLdflU'
sheet_name = 'Income_By_Sex'
url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:xlsx&sheet={sheet_name}'

import pandas as pd
pd.read_excel(url,
            #  has_header=True,
            )
def load_data():
    """Download a public Google Sheet as CSV and return it as a pandas DataFrame.

    Reads the first 7 columns of the 'Fall 2015 Data' tab, renames them,
    parses dates and percentage strings, then derives two columns:
    'Gain_or_Loss' (sign of Multiple_of_Benchmark) and 'Performance'
    (binned Gain_Since_Episode). Requires network access.

    Returns
    -------
    pandas.DataFrame
    """
    sheet_id = '1UnLz40Our1Ids-O0sz26uPNCF6cQjwosrZQY4VLdflU'
    sheet_name = 'Fall 2015 Data'
    # gviz CSV-export endpoint; works only for sheets shared publicly.
    url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'

    cols = list(range(7))
    df = (pl.read_csv(url,
                has_header=False,
                columns=cols,
                skip_rows=1,
                new_columns=['Company','Ticker','Episode_Date','Gain_Since_Episode','NASDAQ_Gain','Multiple_of_Benchmark','Delta_vs_Benchmark'])
    # Dates arrive as e.g. "Jan 05, 2015"; gains arrive as "12.3%" strings.
    .with_columns(Episode_Date=pl.col.Episode_Date.str.strptime(pl.Date,"%b %d, %Y"),
                Gain_Since_Episode=pl.col.Gain_Since_Episode.str.replace("%","").cast(pl.Float64),
                NASDAQ_Gain=pl.col.NASDAQ_Gain.str.replace("%","").cast(pl.Float64),
                Delta_vs_Benchmark=pl.col.Delta_vs_Benchmark.str.replace("%","").cast(pl.Float64))
    # No .otherwise() branch: rows with a null Multiple_of_Benchmark stay null.
    .with_columns(pl.when(pl.col('Multiple_of_Benchmark') >= 0)
                .then(pl.lit('Positive'))
                .when(pl.col('Multiple_of_Benchmark') < 0)
                .then(pl.lit('Negative'))
                .alias('Gain_or_Loss'))
    # 4 break points -> 5 bins, matching the 5 labels.
    .with_columns(Performance=pl.col('Gain_Since_Episode').cut([-20, 0, 10, 50], labels=['Worst', 'Bad', 'Good', 'Better', 'Best']))
    .to_pandas()
    )
    return df

df = load_data()
import pandas as pd

df = pd.read_csv('https://github.com/chris1610/pbpython/blob/master/data/cereal_data.csv?raw=True')
df
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm

# Draw a standard-normal bell curve modulated by a high-frequency sine wave
# (a "zigzag" outline), on a uniform light-blue canvas with no axes.

# Create figure and axis
fig, ax = plt.subplots(figsize=[6, 4])

# Set the background color of the figure
fig.patch.set_facecolor('#d6deeb')

# Set the background color of the axis
ax.set_facecolor('#d6deeb')

# Create a bell curve (Gaussian distribution)
# NOTE: 1,000,000 points is far denser than needed for a 6x4 figure;
# kept as-is, but ~10k points renders identically and much faster.
x = np.linspace(-3, 3, 1000000)
y = norm.pdf(x, 0, 1)

# Apply a sinusoidal wave to the bell curve
# (amplitude scaled by 0.7 * 1.4 = 0.98 of the local curve height)
y_zigzag = y * (1 + 0.7 * np.sin(30 * np.pi * x) * 1.4)

# Plot the zigzagged bell curve
ax.plot(x, y_zigzag, color='#696969')  # Change line color to contrast with background

# Remove axes and make the axes fill the whole figure
ax.axis('off')
ax.set_position([0, 0, 1, 1])

# Save the image
# plt.savefig("bell_curve_zigzag.svg", bbox_inches='tight', pad_inches=0)
plt.show()
import matplotlib.pyplot as plt
import numpy as np

plt.figure(figsize=[6, 6])
x = np.arange(0, 100, 0.00001)
y = x*np.sin(2* np.pi * x)
plt.plot(y)
plt.axis('off')
plt.gca().set_position([0, 0, 1, 1])
plt.savefig("test.svg")
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

# Stylized standard-normal curve: the 95% central interval is "erased" by
# filling it with the background color, leaving only the tails visible,
# plus a dash-dot marker at the mean.

mean = 0  # Mean of the distribution
std_dev = 1  # Standard deviation of the distribution
x = np.linspace(mean - 2*std_dev, mean + 2*std_dev, 10000)
y = norm.pdf(x, mean, std_dev)
fig, ax = plt.subplots(facecolor='#d6deed')
ax.set_facecolor('#d6deed')
ax.plot(x, y, color='#1E90FF', lw=5)  # Set the curve color to blue
# Central 95% interval of N(mean, std_dev); filling it with the background
# color visually hides that part of the curve.
ci_lower, ci_upper = norm.interval(0.95, loc=mean, scale=std_dev)
ax.fill_between(x, y, where=(x >= ci_lower) & (x <= ci_upper), color='#d6deed')

# Calculate the y position of the mean to limit the axvline within the curve
# NOTE(review): y_mean is computed but never used below — the axvline extent
# is hard-coded via ymin/ymax instead; confirm whether it can be removed.
y_mean = norm.pdf(mean, mean, std_dev)
# ymin > ymax is accepted by matplotlib; the line spans between the two
# axis fractions regardless of order.
ax.axvline(mean, color='black', ls='-.', lw=5, ymin=.945, ymax=0.1,
           solid_joinstyle='bevel'
           )

ax.spines[['top', 'left', 'right']].set_visible(False)
ax.spines['bottom'].set_position(('outward', -30))
ax.spines['bottom'].set_linewidth(5)
ax.tick_params(axis='y', which='major', left=False, labelleft=False)
ax.set_xticks([])

plt.show()
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm

plt.figure(figsize=[6, 6])

# Create a bell curve (Gaussian distribution)
x = np.linspace(-3, 3, 10000)
y = norm.pdf(x, 0, 1)

# Apply a tighter sinusoidal wave to the bell curve
y_zigzag = y * (1 + 0.2 * np.sin(50 * np.pi * x))  # Increased frequency and reduced amplitude

# Plot the zigzagged bell curve
plt.plot(x, y_zigzag, color='#1E90FF')

# Remove axes and save the image
plt.axis('off')
plt.gca().set_position([0, 0, 1, 1])
plt.savefig("bell_curve_zigzag_tight.svg")
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm

# Draw a sine-modulated standard-normal curve and save it as an SVG.
fig = plt.figure(figsize=[6, 4])

# Create a bell curve (Gaussian distribution)
x = np.linspace(-3, 3, 1000000)
y = norm.pdf(x, 0, 1)

# Apply a sinusoidal wave to the bell curve
y_zigzag = y * (1 + 0.7 * np.sin(30 * np.pi * x)*1.4)

# Plot the zigzagged bell curve
plt.plot(x, y_zigzag, color='#00BFFF')
# BUG FIX: plt.background_color(...) does not exist in matplotlib and raised
# AttributeError here. Use the supported facecolor APIs instead.
# NOTE(review): the line color and the background are both '#00BFFF', which
# renders the curve invisible — presumably one of them should differ; confirm
# the intended colors.
fig.patch.set_facecolor('#00BFFF')
plt.gca().set_facecolor('#00BFFF')

# Remove axes and save the image
plt.axis('off')
plt.gca().set_position([0, 0, 1, 1])
plt.savefig("bell_curve_zigzag.svg")
import matplotlib.pyplot as plt
import numpy as np

plt.figure(figsize=[6, 6])
x = np.arange(0, 100, 0.00001)
y = x*np.sin(2* np.pi * x)
plt.plot(y)
plt.axis('off')
plt.gca().set_position([0, 0, 1, 1])
plt.savefig("test.svg")
import polars as pl

df = pl.read_clipboard()
df

df = (df
 .with_columns(cs.float().cast(pl.Float32),
               cs.integer().cast(pl.Int16))
#  .write_parquet(r"C:/Users/Jmutenge/OneDrive - searsseating.com/Desktop/ne/blog/datasets/gender_earnings.parquet")
 )
import polars as pl
import polars.selectors as cs
from pathlib import Path

df = pl.read_parquet(r"C:/Users/Jmutenge/OneDrive - searsseating.com/Desktop/ne/blog/datasets/gender_earnings.parquet")
df
from great_tables import GT, html

(
    GT(df, rowname_col='Year')
    .tab_header(title=html("<h4>Average earnings for men and women,<br>overall and by occupation</h4>"))
    .cols_label(All_Males=html('<b style="color: grey;">Men</b>'),
                All_Females=html('<b style="color: grey;">Women</b>'),
                Male_Busdrivers=html('<b style="color: grey;">Men</b>'),
                Female_Busdriver=html('<b style="color: grey;">Women</b>'),
                Male_Cashier=html('<b style="color: grey;">Men</b>'),
                Female_Cashier=html('<b style="color: grey;">Women</b>'),
                )
    .tab_spanner(label=html("<b>All</b>"), columns=['All_Males', 'All_Females'])
    .tab_spanner(label=html("<b>Busdrivers</b>"), columns=['Male_Busdrivers', 'Female_Busdriver'])
    .tab_spanner(label=html("<b>Cashiers</b>"), columns=['Male_Cashier', 'Female_Cashier'])
)
set_width = '100px'
width_dict = {col: set_width for col in df.columns}
from great_tables import GT, md, html

set_width = '100px'
width_dict = {col: set_width for col in df.columns}

(
    GT(df, rowname_col='Year')
    .tab_header(title=html("<h4>Average earnings for men and women,<br>overall and by occupation</h4>"))
    .tab_source_note(
        source_note=md("**Note**: Data is simulated. The units is guavas.")
    )
    .cols_label(All_Males=html('<b style="color: grey;">Men</b>'),
                All_Females=html('<b style="color: grey;">Women</b>'),
                Male_Busdrivers=html('<b style="color: grey;">Men</b>'),
                Female_Busdriver=html('<b style="color: grey;">Women</b>'),
                Male_Cashier=html('<b style="color: grey;">Men</b>'),
                Female_Cashier=html('<b style="color: grey;">Women</b>'),
                )
    .tab_spanner(label=html("<b>All</b>"), columns=['All_Males', 'All_Females'])
    .tab_spanner(label=html("<b>Busdrivers</b>"), columns=['Male_Busdrivers', 'Female_Busdriver'])
    .tab_spanner(label=html("<b>Cashiers</b>"), columns=['Male_Cashier', 'Female_Cashier'])
    .fmt_number(columns=cs.float(), decimals=1, use_seps=False)
    .cols_width(cases=width_dict)
)
# https://pbpython.com/text-cleaning.html
# https://www.youtube.com/watch?v=6Eu2b34alsE
# https://www.youtube.com/watch?v=GGL6U0k8WYA
# https://medium.com/codex/introduction-to-hovertemplate-in-plotly-graphs-e658c1b4a335
import polars as pl

# Vintage-software unit sales used to build a Pareto chart below.
data = pl.DataFrame({
    'Software': ['Lotus-123', 'WordStar', 'dBase III', 'VisiCalc', 
                 'WinZip', 'MS-DOS', 'HyperCard', 'WordPerfect'],
    'Units_Sold': [10000, 4500, 2500, 3000, 1800, 17000, 2200, 1900]
})
# Sort descending, then derive the running total and the cumulative share
# of all units sold (Cum_Pct is a fraction in (0, 1], not a percentage).
df = (data
 .sort('Units_Sold', descending=True)     
 .with_columns(Cum_Sum=pl.col('Units_Sold').cum_sum())
 .with_columns(Cum_Pct=pl.col('Cum_Sum').truediv(pl.col('Units_Sold').sum()))
 )
df
print(f"{df['Units_Sold'].sum():,}")
import plotly.graph_objects as go

# Create the bar chart
fig = go.Figure()

# Add bar chart for Units Sold
fig.add_trace(go.Bar(
    x=df['Software'],
    y=df['Units_Sold'],
    name='Units sold',
    marker_color='#008B8B',
    customdata=df['Cum_Pct']  # Pass Cum_Pct as customdata
))

# Add Pareto line for Cumulative Percentage
fig.add_trace(go.Scatter(
    x=df['Software'],
    y=df['Cum_Pct'],
    name='Cumulative Percentage',
    mode='lines+markers',
    marker_color='#ffffff',
    yaxis='y2'
))

# Update hover template to include Cum_Pct for the bar chart
hover_string_bar = '<b>Units Sold: </b> %{y:,} <br>'
hover_string_bar += '<b>Cum percent: </b> %{customdata:.2%}'
fig.update_traces(hovertemplate=hover_string_bar, selector=dict(type='bar'))

# Update hover template to show percentage for the scatter plot
hover_string_scatter = '<b>Software: </b> %{x}<br>'
hover_string_scatter += '<b>Cum percent: </b> %{y:.2%}'
fig.update_traces(hovertemplate=hover_string_scatter, selector=dict(type='scatter'))

# Update layout to have dual y-axes
fig.update_layout(
    title=dict(text='<b>Software units sold</b><br><b>Pareto chart</b>', font_size=22, pad={'t': 0.75}),
    yaxis=dict(
        title='<b>Units sold<b>',
        showgrid=False,  # Remove major and minor grid lines on the first y-axis
        tickformat=',',  # Format numbers with comma as thousand separator
    ),
    yaxis2=dict(
        title='<b>Cumulative percentage<b>',
        overlaying='y',
        color='#613583',
        side='right',
        tickformat='.0%',  # Round to the nearest whole
        showgrid=False  # Remove major and minor grid lines on the second y-axis
    ),
    xaxis=dict(
        showgrid=False  # Remove major and minor grid lines on the x-axis
    ),
    plot_bgcolor='#FFE4B5',
    paper_bgcolor='#FFE4B5',
    bargap=0.1,
    legend=dict(x=.25, y=1.07, orientation='h'),  # Change padding of legend with y
    font=dict(family='Inter')
)

fig.show()
import polars as pl
import plotly.express as px

data = (pl.read_parquet('draft/datasets/regional_sales.parquet')
      .rename(lambda col: col.replace(' ','_'))
      )
data
df = (data
 .with_columns(Sale_Pct=pl.col('Sales_In_Dollars') / pl.col('Sales_In_Dollars').sum())
 .fill_null('Other')
 )
df
hover_string = "<b>Segment: </b> %{label} <br>"
hover_string += "<b>Sales: </b> $%{value}"

fig = px.pie(data_frame=df, names='Segment',values='Sales_In_Dollars')
# fig.update_traces(hovertemplate=hover_string)
fig.show()
hovertemp = "<b>Segment: </b> %{x} <br>"
hovertemp += "<b>Sales: </b> $%{y}"

fig = px.bar(data_frame=df, x='Segment',y='Sales_In_Dollars', barmode='relative')
fig.update_traces(hovertemplate=hovertemp)
fig.show()
grouped = (df
 .group_by('Segment')
 .agg(pl.sum('Sale_Pct').mul(100),
      pl.sum('Sales_In_Dollars'))
 )
grouped
# Custom data
hover_string = "<b>Segment: </b> %{x} <br>"
hover_string += "<b>Sales: </b> $%{y} <br>"
hover_string += "<b>Percentage: </b> %{customdata[0]:,.2f}%"

fig = px.bar(data_frame=grouped, x='Segment',y='Sales_In_Dollars', custom_data=['Sale_Pct'], barmode='group',
             height=600)
fig.update_traces(marker_color='#D8BFD8')
fig.update_traces(hovertemplate=hover_string)
fig.update_layout(
    plot_bgcolor='#4B0082',
    paper_bgcolor='#4B0082',
    font_color="#ffffff",
    font_family='Inter',
    yaxis={'title':'<b>Sales (in $)</b>', 'tickfont': {'size': 18}},
    xaxis={'title':'<b>Segment</b>', 'tickfont': {'size': 18}}
)
fig.show()
import polars as pl
df = (pl.read_parquet('cleaned_liquor.parquet')
      .rename(lambda col: col.replace('(','').replace(')',''))
      # .filter(pl.col('Store_Name').is_in(["CASEY'S","HY-VEE","KUM & GO"]))
      )
df
(df
 .filter(pl.col('Date').dt.year() == 2023)
 .drop('County','Vendor_Name')
#  .write_parquet('cleaned_liquor_v2.parquet')
 )
(df
 .group_by('Category_Name')
 .agg(pl.sum('Bottles_Sold'))
 .sort('Bottles_Sold')
 ['Category_Name'][0]
 )
total_sales = (df
 ['Sale_Dollars'].sum()
 )

f"{total_sales:,.2f}"
df['Store_Name'].to_pandas().unique()
heat_map = (df
 .select('Store_Name',pl.col('Date').dt.year(),'Sale_Dollars')
 .group_by('Store_Name','Date')
 .agg(pl.sum('Sale_Dollars'))
 .sort('Date')
#  .to_pandas()
 )
heat_map
heat_map.select('Store_Name','Sale_Dollars')
df_tree_map = (df
 .group_by('Store_Name')
 .agg(pl.sum('Sale_Dollars'))
 )
df_tree_map
import polars as pl
(pl.read_excel(r"C:/Users/Jmutenge/Downloads/2018_Sales_Total.xlsx")
 .with_columns(pl.col('quantity').cast(pl.Int8),
               pl.col('unit price','ext price').cast(pl.Float32),)
            #    pl.col('name').cast(pl.Categorical))
 .write_parquet(r"C:/Users/Jmutenge/Downloads/sales_total_2018.parquet")
#  ['ext price'].n_unique()
 )
import plotly.express as px
fig = px.treemap(df_tree_map,
                 path=['Store_Name'],
                 values='Sale_Dollars',
                 custom_data=['Sale_Dollars'])
fig.update_traces(
    hovertemplate='<b>%{label}</b><br>Sales: $%{customdata[0]:,.0f}<extra></extra>'
)
fig.update_layout(
    title={'text': "<b>Store Sales Treemap</b>", 'y':0.9, 'x':0.5,
           'xanchor':'center', 'yanchor':'top', 'font_size':22},
    plot_bgcolor='#D8BFD8',
    paper_bgcolor='#D8BFD8',
    margin={'t': 50, 'b': 10, 'l': 10, 'r': 10},
    font_family="Inter",
)
fig.show()
bar_plot = (df
 .group_by(pl.col('Date').dt.strftime("%B").alias('Month'),pl.col('Date').dt.month().alias('Month_Num'),'Store_Name')
 .agg(pl.sum('Sale_Dollars'))
 .filter(pl.col('Store_Name') == "FAREWAY")
 .sort('Month_Num')
 )
bar_plot
import plotly.express as px

# Monthly sales bar chart for the single store present in bar_plot (FAREWAY).
fig = px.bar(bar_plot, x='Month', y='Sale_Dollars', height=600, custom_data=['Sale_Dollars'])
fig.update_traces(marker_color='#6A5ACD')
fig.update_layout(
    font_family="Roboto",
    # Double quotes around the key keep this f-string valid on Python < 3.12:
    # re-using the outer quote character inside an f-string is a PEP 701
    # feature available only from 3.12 on.
    title={'text': f'<b>{bar_plot["Store_Name"][0].capitalize()} total monthly sales</b>', 'font_size':30, 'pad': {'t': 0.75}},
    plot_bgcolor='#D8BFD8',
    paper_bgcolor='#D8BFD8',
    legend_title=None,
    bargap=0.1,
    font_color="#000000",
    title_font_color="#000000",
    margin={'t': 100, 'b': 3, 'l': 10, 'r': 10},
    xaxis={'title':'', 'tickfont': {'size': 16}},
    yaxis={'title':'<b>Sales (in $)</b>', 'tickfont': {'size': 16}}
)
fig.update_traces(hovertemplate='<b>%{label}</b><br>Sales: $%{customdata[0]:,.0f}<extra></extra>')

fig.show()
line_plot = (df
 .select('Date','Store_Name','Sale_Dollars')
 .group_by(pl.col('Date').dt.year().alias('Year'), pl.col('Date').dt.month().alias('Month'),'Store_Name')
 .agg(pl.mean('Sale_Dollars'))
 .sort('Year','Month')
 .filter(pl.col('Year') == 2019)
#  .filter(pl.col('Store_Name') == "WALMART")
 )
line_plot
# Create heatmap using Plotly
fig = px.line(line_plot, x='Month', y="Sale_Dollars", color='Store_Name')
fig.update_xaxes(dtick=1)
fig.show()
donut_plot = (df
 .group_by('Store_Name', pl.col('Date').dt.year())
 .agg(pl.sum('Volume_Sold_Liters'))
 .filter(pl.col('Date') == 2023)
 )
donut_plot
fig = px.pie(donut_plot, names="Store_Name", values="Volume_Sold_Liters", hole=.5)
fig.show()
(df
 .group_by('Store_Name')
 .agg(pl.sum('Sale_Dollars'))
 .plot.bar(x='Store_Name', y='Sale_Dollars')
 )
import polars as pl
# Acquired podcast episode data (raw export).
acq = pl.read_csv('Acquired_raw.csv', try_parse_dates=True)

# Columns to keep, in display order (names as they appear in the raw CSV).
cols = ['release_date','season','episode_name','parent_company','company','Industry','founded_year',
        'headquarter_country','headquarter_state','Topic','transaction_year','acquisition_value',
        'grading_ben','grading_david','description','duration_ms','Scale Economies','Network Economies',
        'Counter Positioning','Switching Costs','Branding','Cornered Resource','Process Power']

(acq
 .select(cols)
 # snake_case every name and shorten 'headquarter' to 'hq'
 .rename(lambda c: c.replace('headquarter','hq').replace(' ','_').lower())
 # Clean letter grades: 'NULL' -> null, '... minus' -> '...-', '... plus' -> '...+'
 .with_columns(pl.col(['grading_ben','grading_david']).replace('NULL', None).str.replace(' minus','-').str.replace(' plus','+'),)
 # Episode length in whole seconds/minutes/hours (duration_ms is milliseconds)
 .with_columns(t_sec=pl.col('duration_ms') // 1000,
               t_min=pl.col("duration_ms") // (1000 * 60),
               t_hr=pl.col("duration_ms") // (1000 * 60 * 60))
 # Split the total seconds into h:m:s components
 .with_columns(second=pl.col('t_sec') % 60,
               minute=(pl.col('t_sec') // 60) % 60,
               hour=pl.col('t_sec') // 3600)
 # Combine components into a pl.time value
 # (the .alias('time') is redundant: the kwarg already names it 'duration')
 .with_columns(duration=pl.time('hour', 'minute', 'second').alias('time'))
 # Drop the intermediate helper columns
 .drop('duration_ms','t_sec','t_min','t_hr','second','minute','hour')
 .sort('duration')
 )
import polars as pl

bom = pl.read_clipboard()
bom
(bom
 .filter(pl.col('Item Number »').is_not_null())
 .filter(~pl.col('Description').str.contains('LOCTITE'))
 .filter(pl.col('Qty/Assy').is_not_null())
#  ['UM'].unique()
 )
import pandas as pd

df = (pd.read_excel(r"C:/Users/Jmutenge/Downloads/2018_Sales_Total.xlsx")
 .rename(columns=lambda col: col.replace(' ', '_'))
 )
df.sample(6)
(df
 .set_index('date')
 .resample('ME')
 [['ext_price']]
 .sum()
 .assign(pct_total=lambda df_: df_.ext_price / df_.ext_price.sum())
 .rename(columns={'ext_price':'sum'})
 .reset_index()
 .style
 .format({'date':'{:%b-%Y}','sum':'${0:,.0f}','pct_total':'{:.2%}'})
 .hide()
 )
#ADVANCED
from sparklines import sparklines
import numpy as np

# function to create sparklines
def sparkline(x):
    """Return a one-line unicode sparkline summarizing the histogram of *x*.

    Parameters
    ----------
    x : array-like of numbers
        Values to summarize; binned with ``np.histogram`` (default 10 bins).

    Returns
    -------
    str
        One bar character per histogram bin.
    """
    bins = np.histogram(x)[0]
    return ''.join(sparklines(bins))

# Give the aggregation a clean display name so the column produced by
# df.agg(...) is labeled 'sparkline'. The original line assigned an arbitrary
# attribute (`sparkline.sparklines`), which had no effect — pandas reads the
# callable's __name__ for the column label.
sparkline.__name__ = 'sparkline'

# apply the function to modify the dataframe
# Per-customer stats for quantity and ext_price; the sparkline callable adds a
# distribution column next to the numeric aggregates.
data = (df
 .groupby('name')
 .agg({'quantity': ['mean', 'median', sparkline], 'ext_price': ['mean', 'median', 'sum', sparkline]})
 .head(10)
 )
# Label the two column-index levels (blank top level, 'Stats' for the agg names).
data.columns.names=['','Stats']

# Mapping for renaming
rename_mapping = {
    'quantity': 'Quantity',
    'ext_price': 'Price'
}

# Renaming the levels
# set_levels returns a new MultiIndex, so it must be assigned back.
data.columns = data.columns.set_levels(
    [rename_mapping.get(item, item) for item in data.columns.levels[0]], level=0
)

data
# CSS rule dictionaries for pandas Styler.set_table_styles.
cell_hover = {  # for row hover use <tr> instead of <td>
    'selector': 'td:hover',
    'props': [('background-color', '#ffffb3')]
}
# Italic grey styling for the index-name cells.
index_names = {
    'selector': '.index_name',
    'props': 'font-style: italic; color: darkgrey; font-weight:normal;'
}
# Purple header cells (every th except the index name).
headers = {
    'selector': 'th:not(.index_name)',
    'props': 'background-color: #813d9c; color: white;'
}
(data
 .style
 # Two-decimal formatting for the numeric aggregate columns.
 .format({('Quantity', 'mean'): "{:.2f}",
          ('Quantity', 'median'): "{:.2f}",
          ('Price', 'mean'): "{:.2f}",
          ('Price', 'median'): "{:.2f}",
          ('Price', 'sum'): "{:.2f}"})
 .set_table_styles([cell_hover, index_names, headers])
 # overwrite=False appends these rules to the ones above instead of replacing them.
 .set_table_styles([{'selector': 'th.col_heading', 'props': 'text-align: center;'},
                    {'selector': 'th.col_heading.level0', 'props': 'font-size: 2em;'},
                    {'selector': 'td', 'props': 'text-align: center; font-weight: bold;'}], overwrite=False)
 )
(df
 .groupby('name')
 .agg({'quantity': ['mean', 'median'], 'ext_price': ['mean', 'median', 'sum']})
 .sort_values(('ext_price', 'sum'), ascending=False)
 .head(10)
)
# Get mean and sum of ext_price per customer name.
# NOTE: agg takes a *list* here (was a set literal {'mean','sum'}) so the
# resulting column order is deterministic; set iteration order is arbitrary.
(df
 .groupby('name')
 .ext_price
 .agg(['mean', 'sum'])
 .sample(6)
 )
# Same aggregation, rendered as currency via the pandas Styler.
(df
 .groupby('name')
 .ext_price
 .agg(['mean', 'sum'])
 .sample(6)
 .style
 .format('${0:,.2f}')
 )
# Sales by month
(df
 .groupby(pd.Grouper(key='date', freq='ME'))
 .ext_price
 .agg('sum')
 .to_frame()
 .assign(pct_total=lambda df_: df_.ext_price / df_.ext_price.sum())
 .rename(columns={'ext_price':'sum'})
#  .reset_index()
#  .style
#  .format({'date':'{:%b-%Y}','sum':'${0:,.0f}','pct_total':'{:.2%}'})
#  .hide()
 )
(df
 .set_index('date')
 .resample('ME')
 [['ext_price']]
 .sum()
 .assign(pct_total=lambda df_: df_.ext_price / df_.ext_price.sum())
 .rename(columns={'ext_price':'sum'})
 .reset_index()
 .style
 .format({'date':'{:%b-%Y}','sum':'${0:,.0f}','pct_total':'{:.2%}'})
 .hide()
 )
(df
 .set_index('date')
 .resample('ME')
 [['ext_price']]
 .sum()
 .assign(pct_total=lambda df_: df_.ext_price / df_.ext_price.sum())
 .rename(columns={'ext_price':'sum'})
 .reset_index()
 .style
 .format({'date':'{:%b-%Y}','sum':'${0:,.0f}','pct_total':'{:.2%}'})
 .hide()
 .highlight_max(color='darkgreen', subset=['sum','pct_total'])
 .highlight_min(color='red', subset=['sum','pct_total'])
 )
(df
 .set_index('date')
 .resample('ME')
 [['ext_price']]
 .sum()
 .assign(pct_total=lambda df_: df_.ext_price / df_.ext_price.sum())
 .rename(columns={'ext_price':'sum'})
 .reset_index()
 .style
 .format({'date':'{:%b-%Y}','sum':'${0:,.0f}','pct_total':'{:.2%}'})
 .hide()
 .background_gradient(cmap='BuGn', subset='sum')
 )
# Styling with bars
(df
 .set_index('date')
 .resample('ME')
 [['ext_price']]
 .sum()
 .assign(pct_total=lambda df_: df_.ext_price / df_.ext_price.sum())
 .rename(columns={'ext_price':'sum'})
 .reset_index()
 .style
 .format({'date':'{:%b-%Y}','sum':'${0:,.0f}','pct_total':'{:.2%}'})
 .hide()
 .bar(color='#ffa348', vmin=100_000, subset=['sum'], align='zero')
 .bar(color='#c061cb', vmin=0, subset=['pct_total'], align='zero')
 .set_caption('<h2>Sales performance (2018)</h2>')
 )
data
# Advanced styling
data.columns.set_levels(['Quantity','Price'], level=1)
data.columns.set_levels(
    data.columns.levels[0].str.replace('ext_price', 'Price'), level=0
)
data.columns = data.columns.set_levels(
    data.columns.levels[0].str.replace('ext_price', 'Price'), level=0
)
data
import polars as pl
pl.read_csv(r"C:/Users/Jmutenge/OneDrive - searsseating.com/Desktop/ne/javascript/output.csv").write_excel(r"C:/Users/Jmutenge/OneDrive - searsseating.com/Desktop/CN.xlsx")
df.filter(pl.col('Date').str.contains('October'))
92*4
import random

# Number-guessing game: pick a secret number, then prompt until it is found,
# counting only in-range guesses as attempts.
min_num = 1
max_num = 100
answer = random.randint(min_num, max_num)

attempts = 0
running = True

while running:
    # Only int() can raise ValueError here, so the try wraps just the read.
    try:
        guess = int(input(f"Guess a number between {min_num} - {max_num}: "))
    except ValueError:
        print("Please enter a valid number.")
        continue
    # Guard clause: out-of-range guesses don't count as attempts.
    if not (min_num <= guess <= max_num):
        print(f"Please enter a valid number between {min_num} and {max_num}.")
        continue
    attempts += 1
    if guess < answer:
        print("TOO LOW! TRY AGAIN!")
    elif guess > answer:
        print("TOO HIGH! TRY AGAIN!")
    else:
        print(f"CORRECT! The answer was {answer}. It took you {attempts} attempts.")
        running = False
# From Matt Harrison - what I learned from his vids
import polars as pl

pl.col('a').replace(0, None)

df.to_pandas(use_pyarrow_extension_array=True)

(df
 .plot.scatter(yformatter='%.0f')
 .opts(jitter=.8)
 )
import polars as pl

df = pl.DataFrame({
    'Party': ['Australian Greens', 'Australian Labor Party', 'Liberal', 'The Nationals', 'Other Candidates'],
    'Seats_Won': [3, 55, 21, 6, 3]
})
df_pandas = df.sort('Seats_Won').to_pandas()

from matplotlib import pyplot as plt
plt.rc('font', size=7)

fig, ax = plt.subplots(figsize=(6,4), facecolor='#F8F8FF', dpi=120)
ax.spines[['left','top','right']].set_visible(False) #turn off all spines
ax.set_facecolor('#F8F8FF')

ax.barh('Party', 'Seats_Won', data=df_pandas, color=['#808080', "#10C25B", "#006644", "#0047AB", "#DE3533"])
ax.set_ylabel('Party', fontdict={'size':10})
ax.set_xlabel('Number of seats won', fontdict={'size':10}, labelpad=5)
ax.xaxis.grid(linestyle='--')
fig.suptitle('Victorian election 2018 lower house results', fontsize=13, weight=800, y=.93)

# Adding the vertical line
ax.axvline(x=44, ymin=0, ymax=0.8, color='k', linestyle='--', linewidth=1.5)

# Adding the text to the right of the vertical line
ax.text(44 + 1, 3, 'majority of\nparliament', ha='left', va='center', fontsize=9, color='k')

fig.text(0.804, -0.05, 'Data source: Victorian Electoral Commission', ha='right', fontsize=8, va='bottom')
plt.show();
import polars as pl

# Columns kept from the raw Iowa liquor CSV (names shown post-rename:
# spaces and '/' already replaced with '_').
cols = ['Date','Store_Number','Store_Name','Address','City',
        'County','Category_Name','Vendor_Name','Item_Description','Pack',
        'Bottle_Volume_(ml)','State_Bottle_Cost','Bottles_Sold','Sale_(Dollars)',
        'Volume_Sold_(Liters)',
        ]

df = (pl.read_csv('Iowa_Liquor_Sales_20240727.csv', try_parse_dates=True, ignore_errors=True)
 .rename(lambda col: col.replace(' ','_').replace('/','_'))
 # NOTE(review): str.strptime assumes 'Date' is still a string here, i.e. that
 # try_parse_dates above did NOT already parse it (m/d/Y format) — confirm.
 .with_columns(pl.col('Date').str.strptime(pl.Date, "%m/%d/%Y"))
 .with_columns(Year=pl.col('Date').dt.year())
 .filter(pl.col('Year').is_between(2019,2023))
 .select(cols)
 )
df
# Substring -> canonical store name, used below to normalize messy
# Store_Name values: keys are matched with str.contains (e.g. any name
# containing 'CASEY' becomes "CASEY'S"); values are the cleaned labels.
map_stores = {'WAL-MART':'WALMART',
              'HY-VEE':'HY-VEE',
              'CENTRAL CITY':'CENTRAL CITY LIQUOR',
              'SMOKIN':"SMOKIN' JOE'S",
              'FAREWAY':'FAREWAY',
              'CASEY':"CASEY'S",
              "SAM'S":"SAM'S CLUB",
              'KUM':"KUM & GO",
              'CVS':'CVS PHARMACY',
              'WALGREENS':'WALGREENS',
              'YESWAY':'YESWAY',
              'TARGET':'TARGET',
              'HOMETOWN':'HOMETOWN FOODS',
              'KWIK':'KWIK STOP',
              'CIRCLE K':'CIRCLE K',
              "BUCKY":"BUCKY'S EXPRESS",
              }
# For every (pattern, canonical_name) pair, rewrite matching store names to
# the canonical name and keep only those rows, then stack all the results.
def _canonical_rows(pattern, canonical):
    # Rows whose Store_Name contains `pattern`, with the name rewritten.
    renamed = df.with_columns(
        pl.when(pl.col('Store_Name').str.contains(pattern))
        .then(pl.lit(canonical))
        .otherwise(pl.col('Store_Name'))
        .alias('Store_Name')
    )
    return renamed.filter(pl.col('Store_Name') == canonical)

dds = [_canonical_rows(k, v) for k, v in map_stores.items()]

fin_df = pl.concat(dds)
fin_df
(fin_df
 .with_columns(pl.col('Volume_Sold_(Liters)','Sale_(Dollars)','State_Bottle_Cost').cast(pl.Float32),
               pl.col('Store_Number','Bottle_Volume_(ml)').cast(pl.Int16),
               pl.col('Pack').cast(pl.Int8),
               pl.col('Bottles_Sold').cast(pl.Int16))
#  .write_parquet('cleaned_liquor.parquet')
#  ['Bottles_Sold'].max()
 )
# Normalize WAL-MART store names to 'WALMART' in two steps:
# flag the matching rows, then overwrite the name on flagged rows.
(df
 .with_columns(pl.when(pl.col('Store_Name').str.contains('WAL-MART'))
               .then(1)
               .otherwise(0)
               .alias('Check')
               )
 .with_columns(pl.when(pl.col('Check') == 1)
               # pl.lit is required: the original `pl.col('Store_Name') == 'WALMART'`
               # produced a Boolean expression instead of the replacement string,
               # so flagged rows got true/false rather than 'WALMART'.
               .then(pl.lit('WALMART'))
               .otherwise(pl.col('Store_Name'))
               .alias('Store_Name')
               )
#  .drop('Check')
 .filter(pl.col('Store_Name') == 'WALMART')
 )
# NOTE(review): the two loops below are abandoned attempts at the store-name
# normalization done correctly above with `dds`/`fin_df`. Both are buggy and
# their results are not used downstream — kept only as scratch work.
dfs = []
for key, values in map_stores.items():
    # BUG: `values` is a single string, so this inner loop iterates its
    # *characters*; .then(values) also passes a bare string, which polars
    # treats as a column name rather than a literal — presumably pl.lit(values)
    # was intended.
    for value in values:
        dd = (df
                  .with_columns(pl.when(pl.col('Store_Name').str.contains(key))
                                .then(values)
                                .otherwise(pl.col('Store_Name'))
                                )
                 )
        dfs.append(dd)
dfs = []
for keys, values in map_stores.items():
    # BUG: iterates the *characters* of both the key and the value strings,
    # producing len(keys)*len(values) near-copies of df per mapping entry.
    for key in keys:
        for value in values:
            dd = (df
            .with_columns(pl.when(pl.col('Store_Name').str.contains(key))
                        .then(value)
                        .otherwise(pl.col('Store_Name')))
            )
            dfs.append(dd)
    
final_df = pl.concat(dfs)
final_df
(df
#  .with_columns(pl.col('Store_Name').replace(map_stores))
 .filter(pl.col('Store_Name').str.contains("GREAT WALL"))
 )
df.columns
# IOWA LIQUOR
# Load the Iowa liquor sales snapshot written by the download step below.
liquor = pl.read_parquet('iowa_liquor_sales.parquet', try_parse_hive_dates=True)
liquor.head(2)
# Rows whose store name mentions ASHWORTH.
# (was `liq`, an undefined name — NameError)
(liquor
 .filter(pl.col('name').str.contains('ASHWORTH'))
 )
liquor['city'].n_unique()
# Download from the Iowa Socrata (SODA) API. Query parameters must follow '?';
# the original URL '...csv$limit=10000' sent no limit parameter at all.
pl.read_csv('https://data.iowa.gov/resource/m3tr-qhgy.csv?$limit=10000', try_parse_dates=True)#.write_parquet('iowa_liquor_sales.parquet')
df = pl.read_clipboard()
(df
 .rename({'Region':'Region Name','Sales':'Sales In Dollars'})
 .with_columns(pl.col('Sales In Dollars').str.replace(r'\$','').str.strip_chars().str.replace(',','').cast(pl.Float32).cast(pl.Int32))
 .write_parquet('regional_sales.parquet')
 )
import polars as pl
df = pl.read_parquet('regional_sales.parquet')
df.sample(5)
df.select(pl.col.Segment)
(df
 .rename({'Customer ID':'Customer_ID',
          'Customer Name':'Customer_Name',
          'Region Name':'Region_Name',
          'Sales In Dollars':'Sales_In_Dollars'})
)
# method 1
(df
 .select(pl.all().name.map(lambda col: col.replace(' ', '_')))
 )
# method 2
(df
 .rename(lambda col: col.replace(' ', '_'))
 )
# manually
df.columns = ['Customer_ID', 'Customer_Name', 'Region_Name', 'Segment', 'Sales_In_Dollars']
['Customer_ID', 'Customer_Name', 'Region_Name', 'Segment', 'Sales_In_Dollars']
df
import polars as pl
from faux_lars import generate_lazyframe

# Generate a 5000-row dummy dataframe of fake personal data.
rows = 5000
df = (
    generate_lazyframe(
        {
            "clicks": pl.UInt8,
            "Name": "name",
            "City Name": "city_name",
            "Phone": "mobile_number",
            # "Sex":  # FIXME: key had no generator type (SyntaxError) — restore once one is chosen
        },
        rows,
        "en",
    ).collect()
)
df
# Getting month and day names from datetime with polars
import polars as pl
df = pl.read_parquet(r"C:/Users/Jmutenge/OneDrive - searsseating.com/Desktop/ne/blog/posts/062024/last_week_tonight.parquet")
df
(df
 .select('text','time_parsed')
 .with_columns(Datetime=pl.from_epoch('time_parsed'))
 .drop('time_parsed')
 .with_columns(Weekday=pl.col('Datetime').dt.weekday(),
               Month=pl.col('Datetime').dt.month())
 )
(df
 .select('text','time_parsed')
 .with_columns(Datetime=pl.from_epoch('time_parsed'))
 .drop('time_parsed')
 )
# Extract day and month names (short and long)
(df
 .select('text','time_parsed')
 .with_columns(Datetime=pl.from_epoch('time_parsed'))
 .drop('time_parsed')
 .with_columns(Day_Short=pl.col('Datetime').dt.strftime('%a'),
               Day_Long=pl.col('Datetime').dt.strftime('%A'),
               Month_Short=pl.col('Datetime').dt.strftime('%b'),
               Month_Long=pl.col('Datetime').dt.strftime('%B'))
 )
# Why we need short day names
(df
 .select('text','time_parsed')
 .with_columns(Datetime=pl.from_epoch('time_parsed'))
 .drop('time_parsed')
 .with_columns(Day_Short=pl.col('Datetime').dt.strftime('%a'),
               Day_Long=pl.col('Datetime').dt.strftime('%A'),
               Month_Short=pl.col('Datetime').dt.strftime('%b'),
               Month_Long=pl.col('Datetime').dt.strftime('%B'))
 .group_by('Day_Long').len()
 .to_pandas()
 .plot.bar(x='Day_Long', y='len', rot=0, width=.85)
 )
(df
 .select('text','time_parsed')
 .with_columns(Datetime=pl.from_epoch('time_parsed'))
 .drop('time_parsed')
 .with_columns(Day_Short=pl.col('Datetime').dt.strftime('%a'),
               Day_Long=pl.col('Datetime').dt.strftime('%A'),
               Month_Short=pl.col('Datetime').dt.strftime('%b'),
               Month_Long=pl.col('Datetime').dt.strftime('%B'))
 .group_by('Day_Short').len()
 .to_pandas()
 .plot.bar(x='Day_Short',
           y='len',
           rot=0,
           width=.85,
           legend=False,
           color='#dc8add',
           xlabel='',
           title='Total number of comments for each week day',
           figsize=(8,4))
 );
# If you can't wrap your head around how to extract day or week names, use this library.
import polars_xdt as xdt

(df
 .select('text','time_parsed')
 .with_columns(Datetime=pl.from_epoch('time_parsed'))
 .drop('time_parsed')
 .with_columns(Weekday=xdt.day_name('Datetime'),
               Month=xdt.month_name('Datetime'))
 )
# Even cooler, other languages for day names
(df
 .select('text','time_parsed')
 .with_columns(Datetime=pl.from_epoch('time_parsed'))
 .drop('time_parsed')
 .with_columns(Weekday=xdt.day_name('Datetime'),
               French_Weekday=xdt.day_name('Datetime', locale='fr_FR'),
               Ukranian_Weekday=xdt.day_name('Datetime', locale='uk_UA'))
 )
# If we wanted to make that plot with short weekday names
(df
 .select('text','time_parsed')
 .with_columns(Datetime=pl.from_epoch('time_parsed'))
 .drop('time_parsed')
 .with_columns(Weekday=xdt.day_name('Datetime'))
 .with_columns(Weekday_Short=pl.col('Weekday').str.slice(0,3))
 )

# Group_by()  # NOTE(review): stray call to an undefined name (NameError) — commented out; polars grouping is df.group_by(...)

import polars as pl
pl.__version__
orders_df = (pl.read_csv('https://raw.githubusercontent.com/FBosler/Medium-Data-Exploration/master/order_leads.csv',
             try_parse_dates=True)
 .rename(lambda c: c.replace(' ','_'))
 )
orders_df.head()
sales_df = (pl.read_csv('https://raw.githubusercontent.com/FBosler/Medium-Data-Exploration/master/sales_team.csv')
 .rename(lambda c: c.replace(' ','_'))
 )
sales_df
# Join datasets
df = (orders_df
 .join(sales_df, left_on=['Company_Id','Company_Name'], right_on=['Company_Id','Company_Name'])
 )
(df
 .group_by('Sales_Rep')
 )
# https://pbpython.com/sidetable.html

import pandas as pd
import sidetable

df = pd.read_csv('https://github.com/chris1610/pbpython/blob/master/data/school_transform.csv?raw=True', index_col=0)
df
df.stb.freq(['State'])
df.stb.freq(['State'], thresh=50)
df.stb.freq(['State'], thresh=50, other_label='Rest of states')
df.stb.freq(['Region', 'Model Selected'])
df.stb.freq(['Region'], value='Award_Amount')
df.stb.freq(['Region', 'Model Selected'],
             value='Award_Amount', thresh=82,
             other_label='Remaining')
df.stb.freq(['Region'], value='Award_Amount', style=True)
df.stb.missing()
# https://pbpython.com/selecting-columns.html
# https://pbpython.com/pandas-qcut-cut.html
# https://pbpython.com/styling-pandas.html
# https://pbpython.com/pandas-crosstab.html
# https://pbpython.com/pathlib-intro.html
# https://pbpython.com/market-basket-analysis.html

# https://github.com/mjschillawski?tab=repositories
# https://github.com/ritikabhasker/data
import polars as pl
from glob import glob
import re

# Collect every baby-names text file in the folder.
txt_files = glob(r"C:/Users/Jmutenge/Downloads/names/*.txt")

# Read each file, tagging its rows with the 4-digit year from the filename.
dfs = []
for path in txt_files:
    year = re.search(r'\d{4}', path).group()
    frame = (pl.read_csv(path, new_columns=['Name', 'Sex', 'Count'])
             .with_columns(pl.lit(year).alias('Year')))
    dfs.append(frame)

# Stack all yearly frames into one dataset.
data = pl.concat(dfs)

data
grouped_df = (data
 .with_columns(pl.col('Year').cast(pl.Int16),
               pl.col('Count').cast(pl.Int32))
 .with_columns(Decade=pl.col('Year').floordiv(10)*10)
 .group_by('Name','Sex','Decade')
 .agg(pl.sum('Count'))
 )
dfs = []
for year in grouped_df['Decade'].unique().to_list():
    df = (grouped_df
    .filter(pl.col('Sex') == 'M')
    .filter(pl.col('Decade') == year)
    .sort('Count', descending=True)
    .top_k(10, by='Count')
    .with_columns(pl.col('Count').floordiv(1000))
    )
    dfs.append(df)
final_dd = pl.concat(dfs)
(pl.concat([final_df, final_dd])
 .write_csv(r"C:/Users/Jmutenge/Downloads/names/baby.csv")
 )
(data
 .with_columns(pl.col('Year').cast(pl.Int16),
               pl.col('Count').cast(pl.Int32))
 .with_columns(Decade=pl.col('Year').floordiv(10)*10)
 .group_by('Name','Sex','Decade')
 .agg(pl.sum('Count'))
 .filter(pl.col('Sex') == 'F')
 .filter(pl.col('Decade') == 1920)
 .sort('Count', descending=True)
 .top_k(10, by='Count')
#  .with_columns(pl.col('Count').floordiv(1000))
#  .sort()
#  .filter(pl.col('Count') == 0)
#  .filter((pl.col('Decade') == 1920) & (pl.col('Sex') == 'M'))
#  .top_k(10, by='Count')
#  .group_by('Sex','Decade')
#  .agg(pl.first('Name','Count'))
#  .sort('Sex','Count', descending=True)
#  .filter(pl.col('Sex') == 'F')
#  .filter(pl.col('Year').is_between(1935,2020))
#  .write_json(r"C:/Users/Jmutenge/Downloads/names/baby.json")
 )
(data
 .with_columns(pl.col('Year').cast(pl.Int16),
               pl.col('Count').cast(pl.Int32))
 .with_columns(Decade=pl.col('Year').floordiv(10)*10)
 .filter(pl.col('Name') == 'Princetta')
 .filter(pl.col('Decade') == 1960)
 ['Count'].sum()
#  .with_columns(Name_Count=pl.col('Name').count().over('Decade'))
#  .filter(pl.col('Year').is_between(2018,2020))
#  .group_by('Name','Decade')
#  .agg(pl.sum('Count'))
#  .filter(pl.col('Year').is_between(1935,2020))
#  .write_parquet(r"C:/Users/Jmutenge/Downloads/names/baby.parquet")
 )
```{ojs}
//| echo: false
html`<svg width=${700} height=${400} style="background-color: black;">
  ${Plot.plot({
    marks: [
      Plot.dot(cars, { x: "power (hp)", y: "economy (mpg)" })
    ],
    x: "power (hp)",
    y: "economy (mpg)",
    width: 700,
    height: 400
  })}
  <style>
    text {
      fill: yellow;
      font-size: 15px;
      font-family: 'Roboto', sans-serif;
    }
  </style>
</svg>`

```
import polars as pl

df = pl.DataFrame({
    'Party': ['Australian Greens', 'Australian Labor Party', 'Liberal', 'The Nationals', 'Other Candidates'],
    'Seats_Won': [3, 55, 21, 6, 3]
})

df
# Default plot
df.plot.barh(x='Party')
# Fix the ordering of bars
(df
 .sort('Seats_Won')
 .plot.barh(x='Party')
 )
# Convert dataframe to pandas
df_pandas = df.sort('Seats_Won').to_pandas()
df_pandas
from matplotlib import pyplot as plt
plt.rc('font', size=12)

fig, ax = plt.subplots(figsize=(12,6), facecolor='#F8F8FF', dpi=500)
ax.spines[['left','top','right']].set_visible(False) #turn off all spines
ax.set_facecolor('#F8F8FF')

ax.barh('Party', 'Seats_Won', data=df_pandas, color=['#808080', "#10C25B", "#006644", "#0047AB", "#DE3533"])
ax.set_ylabel('Party', fontdict={'size':18})
ax.set_xlabel('Number of seats won', fontdict={'size':18}, labelpad=5)
ax.xaxis.grid(linestyle='--')
fig.suptitle('Victorian election 2018 lower house results', fontsize=24, weight=800, y=.93)

# Adding the vertical line
ax.axvline(x=44, ymin=0, ymax=0.8, color='k', linestyle='--', linewidth=2)

# Adding the text to the right of the vertical line
ax.text(44 + 1, 3, 'majority of\nparliament', ha='left', va='center', fontsize=14, color='k')

fig.text(0.76, -0.028, 'Data source: Victorian Electoral Commission', ha='right', fontsize=14, va='bottom')
plt.show();
# Colors for each party (order matches df_pandas rows, sorted by seats won)
colors = ['#808080', '#10C25B', '#006644', '#0047AB', '#DE3533']

plt.rc('font', size=12)

fig, ax = plt.subplots(figsize=(12, 6), facecolor='#F8F8FF', dpi=500)
ax.spines[['left', 'top', 'right']].set_visible(False)  # turn off all spines
ax.set_facecolor('#F8F8FF')

# Plotting the lollipop chart: one horizontal stem plus an end circle per party.
y = df_pandas['Party']
x = df_pandas['Seats_Won']

# zip instead of range(len(...)): iterate parties, seat counts, and colors in
# lockstep rather than by index.
for party, seats, color in zip(y, x, colors):
    ax.hlines(party, 0, seats, color=color, linestyle='-', linewidth=4)  # lines
    ax.plot(seats, party, 'o', color=color, ms=20)  # circles at the end

ax.set_ylabel('Party', fontdict={'size': 18})
ax.set_xlabel('Number of seats won', fontdict={'size': 18}, labelpad=20)
ax.xaxis.grid(linestyle='--')
fig.suptitle('Victorian election 2018 lower house results', fontsize=24, weight=800, y=0.93)

# Removing y-axis tick marks and reducing the gap between tick labels and the y-axis
ax.tick_params(axis='y', which='both', length=0)  # Remove tick marks
ax.yaxis.set_tick_params(pad=-10)  # Reduce the gap between tick labels and the y-axis

fig.text(0.76, -0.07, 'Data source: Victorian Electoral Commission', ha='right', fontsize=14, va='bottom')
plt.show()