
In this session we will look at how to use Python to explore data.

In designing this version of the material I used a combination of the results from the survey that hopefully most of you took a few weeks back, plus the questions, issues, and requests of people who have come into the CRDDS Consulting Hours over the past few years.

I have designed this for beginning to intermediate users, but it should also be 100% followable for people with absolutely zero Python, or any coding, experience. Just ignore the fact that we are programming and all the subtleties that coding requires, pay attention to the flow of the process, and you all 'should' be golden.

The material can be broken up into five main chapters.

I'm assuming everyone has Python up and running; if not, please let me know now so we can get you going.

A quick tour of Jupyter Lab.

For a MUCH deeper look into the wonderful world of Jupyter Lab, I will be running a workshop on it on Sept 22 from 10:00 - 11:30.
Right now it is via Zoom, but it may end up hybrid live/remote (yet to be determined).

For more information please visit our CRDDS Events page (https://www.colorado.edu/crdds/events).

Jupyter command line interfacing.

Sometimes you need to do something outside the normal realm of Jupyter Notebooks.
One major example, which we will take advantage of right now, is the ability to install packages from Jupyter.
For instance we can use pip or conda, but for speed please consider always using pip from Jupyter, or else jump back out to the command line.
There's a delimiter to tell Jupyter that you want to invoke command line commands, and that's a bang/exclamation mark, '!'. Example:
!pip install pandas

We will be using numerous packages in this session which probably did not come with your installation of Python, so we need to install them.
The next cell contains numerous commented out lines (lines starting with '#').
If you need to install these packages just delete the '#' from the beginning of the line, and when you have uncommented all the desired lines, execute the cell.

This can take some time so just run the cell if needed and by the time we are ready to start running code it should be done.

An Introduction to Important Python Concepts.

Main data structures - Lists, Tuples, and Dictionaries

Actually, let's step back a little bit first and just look at simple variables.

A variable is just a user defined 'name' which references some sort of data.
For example:

foo = 37 # The true answer to life the universe and just everything

Here we have created a variable named foo and stored the value of 37 into it. It is followed by a comment on what the value represents.

The true power, beauty, and sadly part of its dark side, is that EVERYTHING in Python is a non-typed object. A type can be defined as a bunch of text, an integer value, a float value, a long list or matrix of values, or well, anything you can dream up. In Python, at least until just now, you never worry about defining the type of data you are creating and/or using like you do in most every other language. Python 'interprets' (spelled: best guesses, but almost always correctly) the type for you behind the scenes. Hence why Python is known as an interpreted language as compared to a compiled language.

In the past few releases and even more so in the upcoming 3.10 version of Python the concept of type-hinting has been made available to the hardcore coders that seek performance and data type safety. However due to how it works even deeper under the covers (the GIL if you know a bit about Python) even these type-hints are just that 'hints' and Python could conceivably interpret something different at run time. It won't but it could!
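As a tiny, hedged taste of what a type-hint looks like (a sketch only; nothing we do today depends on it):

def add_numbers(a: int, b: int) -> int:
    # the ': int' and '-> int' parts are only hints; Python will still accept other types at run time
    return a + b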

Beyond storing a single piece of data into a variable we have specialized containers to help us out. There are many but the vast majority of time the base containers you will use are;
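For example (toy values of my own, just to show the shapes):

my_list = [1, 2, 3]                 # a list: ordered and changeable
my_tuple = (1, 2, 3)                # a tuple: ordered but unchangeable
my_dict = {'one': 1, 'two': 2}      # a dictionary: key/value pairs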

Now the real fun comes in when you understand that you can compose each of these containers into super containers which hold all kinds of other containers.
For example;
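A couple of made-up illustrations:

list_of_lists = [[1, 2], [3, 4]]                          # a list holding other lists (a simple matrix)
dict_of_lists = {'evens': [2, 4, 6], 'odds': [1, 3, 5]}   # a dictionary whose values are lists
list_of_dicts = [{'name': 'Ada'}, {'name': 'Alan'}]       # a list of dictionaries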

and well anything you can dream up!

For data science the two you will use almost all the time are lists and dictionaries.
Well till we dive into Pandas and dataframes which are different yet under the covers are just composed of standard containers!

There are several other main Python features we need to discuss, such as for-loops, functions, magics, and others. But we will explore those when we run into them.

Obtaining Data

We will start off easy and look at downloading data from a Python package.
The good news is for many major sources of data there are already Python packages written to obtain and often work with that data.
For Census data there are numerous different packages to help you out.
Often you may resort to using multiple different packages to get all the data you are after and then combine them into one grand dataset that fits your needs.

The censusdata Package for Downloading Census Data.

For this example we will use a package called 'censusdata'.

We start off, as you almost always do, with importing the packages we will need for your project.

Notice I received a rather common error. "ModuleNotFoundError: No module named 'censusdata'"
I do not have censusdata installed on my system (mainly because I removed it before we started today).
If you are curious you can do it this way;

!pip uninstall package-name --yes
The '--yes' tells pip to automatically say yes when it asks you 'Proceed (y/n)?'

With Jupyter Lab all I need to do is what you hopefully have already done, or are doing right now: install the package.

There are lots of ways of importing packages; the main way is;
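In our case that is simply:

import censusdata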

Its primary function is 'search()', which is designed to help you find and obtain the subset of census data you may want.

Using censusdata to 'search' for census data

search(src, year, field, criterion, tabletype='detail')

Arguments:

Returns:

So the $64,000 question is how did I know this?
Google? Great guess and often the one you will use but no!

Let's do our first search.
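Something along these lines (this uses the search() signature above; the criterion string 'unemploy' is just an example of a label to hunt for):

sample = censusdata.search('acs5', 2015, 'label', 'unemploy')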

Notice that 'sample' is saved as a list of tuples!
Remember that means that everything is 'indexed' (starting from 0) so we can 'slice' the data to look at it.

First off we will just look at the first full tuple in the list.
NOTE: to access an item of a list, or tuple, by its index you use '[]' with the index number inside it.

Since the tuple inside of it is also composed of several items we can access them with slicing too with a second set of '[]'.
This time we will look at the second item in the first tuple.

What if we want to look at say the first five items in a list.
In this case slicing allows us to give a range of index values as such;
[start index : end index]
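Putting those three ideas into code (assuming our search results are sitting in 'sample'):

sample[0]        # the first tuple in the list
sample[0][1]     # the second item inside that first tuple
sample[0:5]      # the first five tuples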

What if we want to look at things at the bottom of a long list and we have no idea how long that list is?
In this case we add '-' to the index value, and you might assume the last entry is index -0 (it's not, but let's essentially tell Python to 'make it happen!').

In case you have not guessed it, you can use a range with the bottom values too.
Let's look at the 3rd-7th items from the bottom.

Buggers, I guess even Python is not perfect!
You might expect the last item to be '-0' when counting from the bottom, but '-0' is just 0, so negative indexing actually starts at '-1' for the last item.
Just as importantly, a negative slice still works like a normal one: the [start : end] values are both indices counted from the end, and the end value is still exclusive.
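Concretely (again assuming 'sample'):

sample[-1]       # the last item in the list
sample[-7:-2]    # the 3rd through 7th items from the bottom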

Back to the census data: the censusdata package provides a cool pretty print function that displays the data in a nice table.
The function is called 'printtable'.
You can use it like this in conjunction with searching for data.
Note I am not assigning the results to a variable as I'm just hunting for data I may want at this point.

censusdata has functions that provide state and county code information.
Let's get the state codes for each state.
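A sketch of that call using the package's geographies() function (the survey and year here are my guesses):

states = censusdata.geographies(censusdata.censusgeo([('state', '*')]), 'acs5', 2015)
print(states)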

For loops

for loops are a way of iterating through objects such as lists, tuples, dictionaries, and pretty much any container.
for loops are funky, spelled vastly richer, than in pretty much any other language, mainly because we have no type casting on our variables/objects; Python figures it out for us.
Thus we do not need to do the normal indexing and counting we do in other languages.

If I wanted to loop through a set of numbers and print them out I can do this, using a 'range' of numbers for my set.
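For example, printing the numbers 0 through 4:

for i in range(5):
    print(i)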

We can iterate through a list via python background juju like this;
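For example, with a toy list:

my_colors = ['red', 'green', 'blue']
for color in my_colors:
    print(color)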

There are a billion other ways to perform for loops, and they are not the only form of iterating.
We will see and explore other methods throughout this session.

Printing conventions I will be using

Remember I welcomed you into the Python metaverse with a million ways to get 1 thing done?
Well printing is this way for sure. There are a million ways to format printing, maybe two million. And it's always evolving!
Throughout this session I will use the most modern take on print formatting which looks like this.
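That is, f-strings, where variables and expressions go right inside the curly braces:

name = 'Python'
version = 3.9
print(f'Welcome to {name} {version}!')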

Iterating through a dictionary

Recall a dictionary consists of a matching key and value pairing.
To iterate through a dictionary getting both the keys and values we use a special dictionary function called '.items()', which returns the key and value for each iterated item.
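For example, with a toy dictionary:

ages = {'Ada': 36, 'Alan': 41}
for key, val in ages.items():
    print(f'{key} is {val} years old')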

Looking at the 'keys' of a dictionary

Let's say we want a 'list' of all the counties in Colorado derived from the census data.
Yes there are other ways to get counties but we have this data so we should use it especially because in a real use case we will want to do something with the counties in conjunction with other bits of data.

To do this we just call the dictionary's 'keys()' function.

Likewise we could look at just all the values using the dictionary function '.values()'
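Assuming our Colorado county information came back as a dictionary (I'll call it 'counties' here), that looks like:

counties.keys()      # all the county names (the keys)
counties.values()    # all the matching census geography objects (the values)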

How to get a single specific key

To look at a single key, or value, in a dictionary you can use the index of the item you want. However you first need to 'cast' all the keys to a 'list'.

Then you specify the index of the item you want.
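For example (the index 5 is arbitrary):

county_list = list(counties.keys())
county_list[5]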

This is wonderful, but only if you know the index value, and who the heck actually ever knows that without looking and counting.
We will explore much better ways to 'query' for values shortly. But there are other cool things you can do with indices.

But what we want is just the county and not the state.
Happily this is easy to do since they are all separated by a comma, so we can use a special string function, 'split()'.

split() will create a new list from the string we wish to split, and we can specify the character or characters we wish to split the string on.

Since we want just the county, which is the left, or first, part of the string, we can specify index [0] to get the county from the split list.
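For example, with a made-up county string:

'Boulder County, Colorado'.split(',')       # ['Boulder County', ' Colorado']
'Boulder County, Colorado'.split(',')[0]    # 'Boulder County'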

Now let's put it all together to create a list of Colorado counties.
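A sketch of that, still assuming the 'counties' dictionary from above:

co_counties = [name.split(',')[0] for name in counties.keys()]
print(co_counties)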

BeautifulSoup and Web Scraping

So packages can be cool, but what if you want data that does not have a package, or data from a website?
Again there's a ton of different packages out there, but the de facto standard is BeautifulSoup, which helps you scrape web information.

To start off we need to import 2 packages.
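In my case those two are (assuming you fetch the page with requests):

import requests
from bs4 import BeautifulSoup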

If we look at this page: https://en.wikipedia.org/wiki/2020_United_States_census
We find there's a table on this page, 'Population and population change in the United States by state', which is of interest to us (for demo purposes really).

We can use BeautifulSoup to get just this table and move it into a dataframe, which will lead us into the wonderful world of Pandas!
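A minimal sketch of the scrape itself:

url = 'https://en.wikipedia.org/wiki/2020_United_States_census'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')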

We can now take a look at what the scraped page looks like - pretty much the raw html code!

What we want is just the table but we need to find some info to grab it. Namely we need the table class name.

Now that we have the class name we can extract just the table html code.
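Something like this, assuming the table uses Wikipedia's usual 'wikitable' class (check the page source for the exact class name):

table = soup.find('table', class_='wikitable sortable')
print(table)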

Looking at our results we find just the code for that table and nothing else from the page.

What we want to do now is 'clean' all this data and build a nice dataframe (a special table) which we can easily work with.

But first!

A Quick Introduction to Python Functions

One of the main goals of all coding/programming is to build reusable bits of code. This takes the form of functions, classes, templates, etc...
Functions are the simplest and most commonly used form of reusable code and this is what you will work with a lot.
In fact you have already been using them in most everything you have done in Python, R, or whatever language you have worked with, even JavaScript, CSS, etc...

In Python the basic form for a function looks like this;

def function_name(optional arguments):
    -- your code --
    return value (optional)

Note: This is for non-class oriented functions. For classes there's this concept of 'self' which needs to be dealt with, but we do not need to worry about that today.

We want to create a special function which will parse the raw html code in such a way as to build a list of lists which will represent our table of data (a matrix for math minded folks).
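A rough sketch of such a function (the real table's quirks may need extra handling, so treat this as a template):

def parse_table(html_table):
    # build a list of lists, one inner list per table row
    rows = []
    for tr in html_table.find_all('tr'):
        cells = tr.find_all(['th', 'td'])
        rows.append([cell.get_text(strip=True) for cell in cells])
    return rows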

Notice that when we try to execute the cell with the function in it NOTHING happens.
The function is just a container for code that just sits there till we 'call' it into action.

Now we can store this list of lists as an all important Pandas dataframe.
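Roughly (assuming the first row of our list of lists holds the column headers):

import pandas as pd

rows = parse_table(table)
df_states = pd.DataFrame(rows[1:], columns=rows[0])
df_states.head()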

An Introduction to Pandas

First, let it be known there are week-long, all-day workshops which just scratch the surface of how to use Pandas for Data Science.
Thus this will be quick and very dirty, but some of the most important essentials will be laid out.

As mentioned before Pandas uses a specialized container called a dataframe.
It also provides you with a gazillion functions to help you work with the dataframe.

That's it in a nutshell. Questions?

Seriously though we as always start with the package import.
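Which is just:

import pandas as pd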

There are over 308 million rows of data representing every US resident from the 2010 census; sorry, getting the same level of information for 2020 was not possible, yet!
The data consists of location values in an 'easting' and 'northing' coordinate system for longitude and latitude respectively.
Along with the location we have 5 columns of data: the sex of the individual, education level attained, annual income, class of worker, and age.

The data is stored in a Parquet format which is extremely excellent for larger data. By comparison, the census_data.parq file I created is >2.8GB. The same file saved as a .csv is more than 3x larger!
The good news is Pandas knows how to read and write Parquet files.
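Reading it in looks something like this (the file name/path is a placeholder for wherever your copy lives):

df = pd.read_parquet('census_data.parq')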

Paths to auxiliary data

It's important to know the path to where your data is when you try to read it in.
There are a number of ways of adding the complete or relative path to your data into your code, but this becomes a lengthy conversation, so let's assume if you are working natively on your own machine you should use the following line.

IF however, you are using the EC2 shared instance please uncomment the following line and use it.

We can look at the full dataset, well sort of with just;

If we just wanted to look at the top few rows of the data we can use Pandas head() function.

Likewise we can just look at the bottom few rows with the tail() function.
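For example:

df.head()    # the first 5 rows by default
df.tail()    # the last 5 rows by default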

She's pretty big, but just how long is it? As in, how many rows of data?
We can check that by using the length function len().

That just gave us the number of rows.
What if you wanted the number of both rows and columns?
In this case we can use the Pandas shape attribute like this;
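That is:

len(df)      # number of rows
df.shape     # (number of rows, number of columns)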

Either way this is way too large for the purposes of this workshop, so let's cut the dataframe down to just 1 million rows.
NOTE: reducing the dataframe size will affect any actual results we investigate since we will only be working with about 1/308th of the full dataset!!!
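The simplest way is just to slice off the first million rows, something like:

df = df[:1_000_000]    # tacking .copy() on the end would make this a deep copy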

Important Note: This is nice and could be sufficient but it will create warnings because this makes a shallow copy as opposed to a deep copy.
Don't ask or worry about what this means.

For most other file formats we could just load the first 'nrows', or even skip n rows and then access the next nrows.
But because of the columnar nature of parquet files this option is not available in Pandas's read_parquet().

Let's look at the data

The first two columns are the easting and northing location columns; we need this format for later visualization with Datashader.
The rest of the data is coded as numeric codes. Why numeric codes and not actual values? Remember how long it took to deal with just 2.8GB of data?
But we can now easily change the data into a more 'readable' format, which we will do now.

First thing we want to do is look at all the unique values and maybe sort them so we can see what we have in the data.

Column 2 - Sex

We can also look at the count for each of the unique values.
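For the 'sex' column (assuming that is the column name) those look like:

sorted(df['sex'].unique())    # the unique codes, sorted
df['sex'].value_counts()      # how many rows hold each code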

Code Table

Knowing the code table we can replace all the int values with text to make the column more readable.
We will use a special parameter, 'inplace=True', to make the replacement in the current dataframe. Without this parameter we would need to do this;

df['sex'] = df['sex'].replace({0: 'Female', 1: 'Male'})

This is also a slower process than doing it in place.
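With the inplace parameter it becomes:

df['sex'].replace({0: 'Female', 1: 'Male'}, inplace=True)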

Column 3 - Education

This is the level of education obtained by each individual and is kind of a wonky bit of data, as there is crossover in some of the potential values.
But this 'may' be helpful depending on what you are trying to explore.

This time we will also sort the data noting that we can do multiple processing steps all at the same time, well in the same line anyways.

Note we 'could' sort the 'value_counts', but then we would lose context of which count applied to which unique value.
So to do this we need to iterate through all the unique values and their counts.
The problem is we also want the index value, and 'normal' Python for-loops do not rely on count values like, say, C/C++ does.
But we can get the same effect with the 'enumerate' function as part of the for-loop.
This will give us two returns: the first is the 'index' count and the second the actual value.
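For example, over the education column's unique values (assuming the column is named 'education'):

for i, val in enumerate(df['education'].unique()):
    print(f'{i}: {val}')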

We can use the same mechanism to replace the code-value with the categorical text equivalent and print it all out so we can make more sense of the data.
To do this we create a list of the correct labels.
Then when we iterate through data we replace the code with the text.

IMPORTANT NOTE: this change is only in the printed output and not in the data itself!
We will have to run the 'replace' code to make this change to the data itself!
We already have a list of the values so you'd think we could just loop through all the data and replace them as we go. And we can!
BUT it is insanely slow.
Below is an example of how this can be done. DO NOT RUN IT NOW - hence why it's commented out!

Instead we will use our edu_list and convert it into a dictionary and then run the Pandas 'replace' function.
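Roughly like this, assuming edu_list is ordered so its position matches the numeric code:

edu_map = dict(enumerate(edu_list))    # {0: first label, 1: second label, ...}
df['education'] = df['education'].replace(edu_map)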

Notice: In the above, commented out, cell I use a Notebook 'magic' to time the function.
Jupyter has a number of special 'magic' functions which provide extra, behind the cell code functionality.
In this case I'm using the '%%time' magic to have Jupyter time how long it takes to execute the cell the magic resides in.
It then prints the time out for us.

So comparing in our for-loop attempt at replacing the category values versus just letting Pandas do it we find the following results:

For-loop replace time: 280 seconds
Pure Pandas replace time: 18.2 seconds

That's a 1439.46% speedup!!!

Moral of the story? 1st law of coding, don't write something that has already been written. You won't do it better!

Column 4 - Income

This is the net income reported for each individual age 25 and over.

Looking at the data with the category.

Column 5 - Class of Worker

This is a gross, in more ways than one, categorization of how each person 'may' be employed.
We will go through our normal steps as above.

Column 6 - Age

Age is just that: the reported age of the individual.
I'll leave it as an exercise for you to do the value-counts if you desire. We will look at it differently shortly.

Columns 0 and 1 Location information

This is the location information for each individual's residence in easting and northing coordinates.
Easting and northing is basically a coordinate system measuring the physical distance east and north, respectively, from a base point.

We can use Pandas min and max functions to find the minimum/maximum easting and northing values across all individuals.
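For example (assuming the columns are named 'easting' and 'northing'):

df['easting'].min(), df['easting'].max()
df['northing'].min(), df['northing'].max()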


A Quick Introduction to Plotting

There are a HUGE number of different packages that you can use for plotting data including, but far from limited to:

For the longest time matplotlib ruled the roost and is probably by far the most often used package, mainly out of familiarity.
But the new king of the hill is Plotly. Till a couple of years ago people, such as me, laughed at the concept of Plotly being the 'One' package.
Then the folks at Plotly made some massive changes, expansions and buy-ins, making one massively powerful tool.
One of the advantages of Plotly is it works in many different languages, almost exactly the same in each. This includes Python, R, Julia, its native JavaScript, and others.
Did I mention that in the background it's pretty much all JavaScript? That makes it valuable for web presentations too!
Another advantage is if you can dream a plot you can create that plot, albeit at the cost of a bit (spelled - lot) of work on your part, but very doable.
How big and powerful is Plotly now? Matplotlib is still the default plotting package built into Pandas, but you can now tell Pandas to hand its plotting off to Plotly instead, which we will do below.

Plotly + Dash + other packages

Dash is, we shall just say, an extension of Plotly (not really but they are now married together!) which allows you to create rich interactive dashboards using Plotly graphics.
Not only can you use Dash with Plotly but Plotly allows you to use other plotting packages inside it as well, such as matplotlib (but why?), Seaborn, Datashader and others.
We will look at native Datashader in just a bit.

Interested? I have provided a multi-session set of trainings on using Plotly and Dash and am in the process of creating new video trainings covering all that material and much more.
If you are interested, these trainings will be made available later this year.

Basic Plotly Figure Architecture

Compared to Matplotlib, Plotly has a simpler, more refined 'layout' based approach, as shown below. Note the same architecture you see in, say, matplotlib still exists here; it just hides in the background.

Plotly_Architecture.png

The real power of customizing your plots to look exactly how you want/need them lies in the expansive usage of 'update_traces' and 'update_layout'. More on this later, but just know that this makes life easier, once you're used to it, than working in other visualization packages.

Major Plotly Modules

Note: Figure factories appear to be slowly going away with some of their features already moved into express and/or graph_objects.

Plotly Examples, Documentation, and Example Data Packages

It's always important to know where the documentation and examples are. You will find yourself referring to the two following reference documentation pages a lot, and happily they are really well written!

https://plotly.com/python/

https://plotly.com/python-api-reference/generated/plotly.data.html

Basic Plotly Plots

To begin with we will take advantage of the fact that Pandas can use Plotly, as well as Matplotlib to plot data and we will immediately see the advantage of using Plotly over Matplotlib.

First we will just create a new pandas dataframe and fill it with 25 random numbers in 4 different columns which we will name A, B, C, and D

Now we will ask pandas to plot the data out using plotly.
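A sketch of both steps; switching the Pandas plotting backend over to Plotly is one way to do the second:

import numpy as np
import pandas as pd

df_rand = pd.DataFrame(np.random.rand(25, 4), columns=['A', 'B', 'C', 'D'])

df_rand.plot()                            # the default Matplotlib version

pd.options.plotting.backend = 'plotly'    # hand plotting off to Plotly
fig = df_rand.plot()
fig.show()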

You should immediately notice the difference in quality between the two plots. Also of major importance, notice in the top right of the Plotly version you have a funky toolbar called a 'modebar'. It allows you a healthy degree of interaction to start exploring your data.

Note: This is FAR from the end all be all of your ability to create interactive data visualization with Plotly. We will look at a wee bit more towards the end but the real power will be presented in the next workshop!

Plotly Express(px) versus Plotly Graph_Objects(go)

Plotly Express

Plotly express allows you to create plots as quickly and as easily as possible. Think of it as an automated plug and play for your data. This comes at the price of some customization and cross package functionality. This is a great solution when you first start working with plotly or need to quickly explore and/or share your data.

We will work with both express and graph_objects to show they work basically the same.
HOWEVER - Please note that the parameterizations for one may be, and often are, slightly different than the other. In fact, in some cases some cool functionality for quick interactivity only exists in express. You can indeed do the same thing and more with graph_objects, but it's more complex than just a simple key-value pairing.

Plotly Graph_Objects

Plotly graph objects are the real meat of Plotly. They allow you to create what you need the way you want it, albeit at the cost of a bit more complexity.
From here on out we will concentrate on graph objects for our plots.

Let's move back to our census data and explore it better

We can change the 'orientation' parameter and swap the x and y axis to create a horizontal bar chart.
We will also change it so we are coloring by the different categories. This will give us a unique color for each category.
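A sketch with Plotly Express, using the education counts as the example data:

import plotly.express as px

edu_counts = df['education'].value_counts()
fig = px.bar(x=edu_counts.values, y=edu_counts.index,
             orientation='h', color=edu_counts.index)
fig.show()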

Histograms

We can just as easily create a histogram of say 'Age'. In this data case there will be little difference between this and a bar plot.
Well except for the fact we can control the number of bins in the histogram!
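For example, an Age histogram with a chosen number of bins (column name assumed to be 'age'):

import plotly.express as px

fig = px.histogram(df, x='age', nbins=20)
fig.show()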

Pandas Query to help compare data.

While all of this has been 'maybe' interesting, the real value in the data comes when we start expanding the number of dimensions of data we explore.
In this next case we will look at the comparison of educational attainment based on gender.

To make this happen we need to find all the data which corresponds to each gender and then go through the same process we did above to see the count for each category.
Pandas has a wonderful function called 'query' which makes this happen. To make it work we call 'query' on the dataframe and specify the column we want to query on and what value in that column we are interested in.

To plot the data we use a graph_object Bar chart with multiple traces.
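A sketch of the whole comparison (column names assumed as before):

import plotly.graph_objects as go

male = df.query("sex == 'Male'")
female = df.query("sex == 'Female'")

fig = go.Figure()
fig.add_trace(go.Bar(x=male['education'].value_counts().index,
                     y=male['education'].value_counts().values,
                     name='Male'))
fig.add_trace(go.Bar(x=female['education'].value_counts().index,
                     y=female['education'].value_counts().values,
                     name='Female'))
fig.show()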

Next up - Pie Charts

Pie charts take both a 'labels' and a 'values' parameter, which are just as they sound.
In our next example we will create a pie chart of 'Worker Class'.
We will also 'explode' out the slice which represents 'Self-employed' workers.
Lastly I'll finally show you how to modify the interactive text-over display.
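A sketch, assuming the worker class column is named 'worker_class' and already holds text labels:

import plotly.graph_objects as go

wc_counts = df['worker_class'].value_counts()
pull = [0.2 if label == 'Self-employed' else 0 for label in wc_counts.index]

fig = go.Figure(go.Pie(labels=wc_counts.index, values=wc_counts.values, pull=pull,
                       hovertemplate='%{label}: %{value} people<extra></extra>'))
fig.show()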

Datashader

Datashader is a specialized visualization tool that lets you work with EXTREMELY large datasets insanely quickly, in parallel, and, if available, on GPUs.
By large, I suggest that 308 million rows of data is puny! I have run 10+ billion point datasets in less than a minute!
It does this by aggregating and rasterizing the data into regular grids. There is still healthy room for improvement, as interactivity with the data is still fairly lacking.

While you can render Datashader directly in Plotly and/or Dash, for this quick demo we will just look at pure Datashader, as it's already a different enough beast compared to what you have seen so far.

For full information please visit: https://datashader.org/

We start with a slew of imports we need for Datashader and rendering in Jupyter.
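A typical set looks something like this (yours may vary a bit):

import datashader as ds
import datashader.transfer_functions as tf
from datashader.utils import export_image
from datashader.colors import Greys9, viridis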

For Datashader we can easily play with all 308 million rows of data, so for data safety's sake we will begin by reloading the full dataset.

First thing we will do is define a geographic region we want to limit the render to, in this case the CONUS region.
For fun we will also look at a region that roughly outlines the 'almost' rectangular shape of Colorado.

Notice that while our dataset is already in easting/northing coordinates, which we need for Datashader, Datashader provides a handy utility function to convert conventional lat/lon data for us.

We'll do a little definition setup

Now we need to create a datashader 'canvas' which is the equivalent of a Plotly or Matplotlib figure.
Then we set up the aggregation of the point into a grid on the canvas.
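In code that looks roughly like this (the plot size, region bounds, and column names are my placeholder assumptions):

x_range, y_range = (-13_900_000, -7_450_000), (2_700_000, 6_450_000)   # very rough CONUS bounds in easting/northing meters
cvs = ds.Canvas(plot_width=900, plot_height=525, x_range=x_range, y_range=y_range)
agg = cvs.points(df, 'easting', 'northing')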

Next we define how we want to export the data into a final, saved image.
The 'export_path' is the location of where we want the outputted image to be saved and will create that path if it does not exist.

Then we specify the colormap (cm) we want to use. In this first case the opposite of our background color.

Now for the magic we actually create the plot itself.
In this case it will be an aggregated density plot of the residence location of each of the 308 million people in the census data.
We will start with a Gray color map.
Think of the 0.2 in the cmap as the intensity of the density aggregation transfer function. (OK it's really closer to the point size but ignore that for now) We will use a 'log' scale of the transfer function.
Finally we specify the file name which will be saved to the export_path we defined earlier.
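A minimal version of that cell, using the canvas aggregation from above (I'm skipping the colormap_select helper that provides the 0.2 intensity knob and just using a plain Greys colormap):

img = tf.shade(agg, cmap=Greys9[::-1], how='log')    # reversed greys so dense areas show bright on black
export_image(img, 'census_gray_log', export_path='export', background='black')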

We can easily change the colormap to say Viridis and just re-render the output.

We can provide more contrast by changing the 'how' interpolation to something like 'eq_hist'.

Putting it all together, let's look at a closeup region of just Colorado(ish).
Ideally you would actually create a function for most of the below code and then just call it to create numerous different exploratory plots.

For the curious, let's time how long it takes to render all 308+ million datapoints.

Sankey Diagrams (if time permits)

Sankey diagrams show the 'flow' of the data. They consist of a series of 'nodes' which represent data categories.
The width of the connecting 'edges' is directly proportional to the value associated with each node.
The value of Sankey diagrams is they help show the relationship between many different categories of data and their associated value.
They are often used to show relationships in Census data, manufacturing, business decision models, and political models.
In science they are used to look for new connections in data (e.g. in climatology they help show teleconnection relationships).

NOTE: Sankey diagrams are sometimes referred to as Alluvial Plots and a few other names.

We will explore Sankey diagrams with totally manufactured (spelled: fake) data, as this will make it easier to see what's happening.

Sankey diagrams require 3 main pieces of information derived from the data: the 'sources', the 'targets', and the 'values' of the links between them.

To create the Sankey in Plotly we need to use a graph object conveniently called 'Sankey'.
The process requires describing what the nodes look like as well as the links.
The links contain our 'source', 'target', and 'value' data.
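A tiny made-up example just to show the shape of the call:

import plotly.graph_objects as go

fig = go.Figure(go.Sankey(
    node=dict(label=['Website', 'Page A', 'Page B'], pad=15, thickness=20),
    link=dict(source=[0, 0],     # each link starts at node 0 ('Website')
              target=[1, 2],     # ...and ends at 'Page A' or 'Page B'
              value=[8, 4])))    # the link widths
fig.show()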

Now lets look at something more advanced albeit still very much contrived.
We will create a list of lists which represent websites as the 'sources' and pages on the website as targets.
We will create random numbers to represent the number of user visits to each page. This will be our 'values'.

Create the nodes, sources and targets

Now we can create our Sankey diagram as we did above.

We can pretty this up a bit by adding in some unique colors to each node.
Change the background to black and then adjust the font so it shows better against the background.
We will also change the size of the plot so it's easier to see what's happening.

For the colors we will use a Plotly Express color map called 'D3' (if you are wondering: yes, named after JavaScript's insanely powerful D3 visualization package).
From this color map we will extract a random color for each node and for each edge.
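Grabbing random colors from that colormap looks something like this (the node_labels and sources lists here stand in for the ones built above):

import random
import plotly.express as px

node_labels = ['Website', 'Page A', 'Page B']    # stand-in for the real node list built above
sources = [0, 0]                                 # stand-in for the real sources list

palette = px.colors.qualitative.D3
node_colors = [random.choice(palette) for _ in node_labels]
link_colors = [random.choice(palette) for _ in sources]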

And once again we create our Sankey diagram.

Is Pandas the end all be all for data science?

That's a loaded question. The answer is yes, and no, but really yes. Let's qualify that.
Pandas is by far the de facto standard for all things data (in the Python metaverse). But with our dataset we are pushing the upper limits of in-core memory computation.
Pandas 'can' get the job done, but we already saw the processing speed with just 1 million of the 308 million rows of data. So what do we do if we are playing with even larger datasets? I, personally, routinely work with datasets that start at 35+ GB and often with hundreds of similar files all at once!

The answer lies in other similar packages such as Dask, Dask_XArray, CuDF (NVIDIA's CUDA dataframe package which LOVES GPUs) and a few others. These packages are designed to work with extremely large datasets with speed and ease.
So there's the 'no' part of my original answer. Now for the 'yes' part. All of these other packages mimic Pandas as closely as their core architectural concepts will allow. The one drawback (well, there are a few for each package) is that Pandas has been around for many, many years, has a huge support community, and is constantly being improved. These mimic packages are only a few years old and just don't have the same long development history as Pandas. But each is working hard to provide as much of the Pandas-like functionality, usually with the same function names and parameterizations, as possible.

Be warned, if any ATOC students are here, that you often have to jump through many hoops and loops to get from one file format (namely netCDF) into these other packages, get the data processed the way you want, and then back into netCDF. If you are one of these future individuals, come seek me out, as I have developed a rather funky solution to processing hundreds of netCDF files, generating new composite netCDF file(s), and saving them back out to netCDF. Not as simple as one would expect!!!

Is Plotly the end all be all for data visualization?

Yes, but no, nothing is. Yes, it's the current king of the hill (Pythonically speaking), especially when mixed with Dash. But when you get down to it there are a near infinite number of ways to visualize data and no one package can do them all. So for the vast majority of what a normal person would want to do, Plotly is for you; unless you already have Matplotlib, Seaborn, etc... code to work with, then use it till you need more! Else look into all the other wonderful packages out there to expand your horizons.