In this section we will look at data visualization, exploration, validation and analysis.
This is an exceedingly rich and vital part of working with your data and should be included in most phases of your data pipeline.
Wait, what? All phases? Yep, pretty much all phases, including in many cases gathering/generating the raw data itself.
By the end of this section you will begin to get the first glimmer of a glimpse as to why this is so valuable and important!
But alas, we will not cover everything, because I have spent a lifetime exploring visualization, going back to when the first scientific visualizations were published in the 70's (yes, there are people that old still breathing!). And the fun part is this area changes, grows, and evolves EVERY single day!!!
There are a HUGE number of different packages that you can use for plotting data, including, but far from limited to, Matplotlib and Plotly, the two we will talk about here.
For the longest time Matplotlib ruled the roost, and it is probably still by far the most often used package for Python data visualization.
However, a few years ago another old-school package, Plotly, beefed itself up with some mega coding steroids of the D3 variety, and it is now the king of the hill both for straight-out visualization and for user-interactive exploration (spelled GUI-enabled and dashboard-driven via its sibling library Dash).
One of the advantages of Plotly is that it works in many different languages, almost exactly the same in each. This includes Python, R, Julia, its native JavaScript, and others.
Did I mention that in the background it's pretty much all JavaScript? That makes it valuable for web presentations too!
Another advantage is that if you can dream up a plot you can create that plot, albeit at the cost of a bit (spelled: a lot) of work on your part, but it is very doable.
How big and powerful is Plotly now? Well, Matplotlib has long been the plotting package built into Pandas, but Pandas now lets you swap Plotly in as its plotting backend with a single option, so you can tell Pandas which one you wish to use.
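A minimal sketch of what that looks like (an assumption-laden illustration, not part of our pipeline; it requires Plotly to be installed alongside a reasonably recent Pandas, and the demo data is made up):
import pandas as pd

pd.options.plotting.backend = "plotly"                   # df.plot(...) now returns a Plotly figure
demo = pd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 9]})    # made-up data for illustration only
fig = demo.plot(x="x", y="y")                            # a Plotly Figure, not a Matplotlib Axes
fig.show()

pd.options.plotting.backend = "matplotlib"               # switch back to the default backend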
Compared to Matplotlib, Plotly has a simpler, more refined 'layout'-based approach, as shown below. Note that the same architecture you see in, say, Matplotlib still exists here; it just hides in the background.
The real power of customizing your plots to look exactly how you want/need them lies in the expansive use of 'update_traces' and 'update_layout'. More on this later, but just know that, once you are used to it, this makes life easier than working in other visualization packages.
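As a quick taste of that pattern, here is a hedged sketch (with made-up data, not our dataset) of building a quick figure and then restyling it after the fact:
import plotly.express as px

fig = px.scatter(x=[1, 2, 3, 4], y=[10, 11, 12, 13])           # quick-and-dirty figure
fig.update_traces(marker=dict(size=14, symbol="diamond",        # restyle every trace after the fact
                              opacity=0.6))
fig.update_layout(title="<b>Restyled after the fact</b>",       # retitle/resize the whole figure
                  width=600, height=400)
fig.show()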
Plotly Express (plotly.express): Plotly Express is the high-level API of Plotly, and it is much easier to draw charts with this module. We can even draw a whole figure with a single line of code. That being said, it is relatively newer, and not every option is covered as deeply in the documentation.
Graph objects (plotly.graph_objects): This is the module that contains the objects or shape templates used to visualize. Graph objects are the low-level interface to figures, traces, and layout. Graph objects can be turned into their Python dictionary representation, and, similarly, you can turn a JSON representation back into a graph object.
Subplots (plotly.subplots.make_subplots): This module contains the helper functions for laying out multi-plot figures, i.e. figures with predefined subplots configured in the 'layout'.
Figure Factories (plotly.figure_factory): This module provides many special types of figures that are quite difficult to draw directly with graph objects or Plotly Express. These figures can be easily plotted with Figure Factories. These charts include: Annotated Heatmaps, Dendrograms, Gantt Charts, Quiver Plots, Streamline Plots, Tables, Ternary Contour Plots, and Triangulated Surface Plots.
Note: Figure factories appear to be slowly going away with some of their features already moved into express and/or graph_objects.
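Since we will not use make_subplots later in this section, here is a minimal sketch of it, along with the dict/JSON round-trip mentioned above (the data values are made up for illustration):
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=1, cols=2, subplot_titles=("A scatter", "A bar chart"))
fig.add_trace(go.Scatter(x=[1, 2, 3], y=[4, 2, 5], mode="markers"), row=1, col=1)
fig.add_trace(go.Bar(x=["a", "b", "c"], y=[3, 1, 2]), row=1, col=2)
fig.show()

fig_dict = fig.to_dict()        # graph object -> plain Python dict
fig_json = fig.to_json()        # graph object -> JSON string
fig2 = go.Figure(fig_dict)      # ...and back to a graph object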
It's always important to know where the documentation and examples are. You will find yourself referring to the two following reference documentation pages a lot, and happily they are really well written!
https://plotly.com/python/
https://plotly.com/python-api-reference/generated/plotly.data.html
Let's start off by looking at a simple Plotly Express scatter plot.
We begin as always with importing all the packages we will need for this section.
import numpy as np
import pandas as pd
import os
import plotly.graph_objects as go
import plotly.express as px
import nltk
from nltk.corpus import stopwords
import re
import unicodedata
from collections import Counter
from ast import literal_eval
import pycountry
We will work with our WoS data we just got through playing with by loading it up into a fresh DataFrame.
df = pd.read_csv(os.path.join('Processed_data', 'Combined_lists.csv'))
For this first plot let's look at the page counts for each item in our DataFrame.
To do this let's make a little function that creates and returns a new DataFrame holding the number of pages for each item.
Along with this we will generate a count of the number of times each page count is found in the DataFrame.
Since the page numbers end up as the index of value_counts(), we will rename that index to 'NumPages', reset it into a regular column, and name the counts column 'Counts'.
We then will go ahead and call our new function.
def get_page_counts(df):
    pc = df['Number of Pages'].value_counts().rename_axis('NumPages').reset_index(name='Counts')
    return pc
pc = get_page_counts(df)
pc
  | NumPages | Counts
---|---|---
0 | 12 | 824 |
1 | 10 | 806 |
2 | 11 | 741 |
3 | 14 | 716 |
4 | 13 | 685 |
... | ... | ... |
96 | 0 | 1 |
97 | 201 | 1 |
98 | 85 | 1 |
99 | 67 | 1 |
100 | 359 | 1 |
101 rows × 2 columns
The advantage of Plotly Express, as already stated, is that it makes it fast and easy to create simple plots.
We will create a figure ('fig') based on an Express scatter plot.
We then assign the x values to the 'NumPages' column and the y values to the 'Counts' column.
fig = px.scatter(x=pc['NumPages'],
y=pc['Counts'],
)
fig.show()
Let's just explore this plot for a minute.
Notice anything uncomfortable?
Surely something with >350 pages must be a book.
Right?
# Let's look at the number of pages just for Journals and send it into a new DataFrame
jpdf = df.loc[df['Publication Type'] == 'Journal']
jpdf = jpdf['Number of Pages'].to_frame()
# Now let's run this new DataFrame of just the Journals info through our handy function.
pc = get_page_counts(jpdf)
pc
  | NumPages | Counts
---|---|---
0 | 12 | 755 |
1 | 14 | 676 |
2 | 11 | 659 |
3 | 13 | 654 |
4 | 10 | 649 |
... | ... | ... |
83 | 69 | 1 |
84 | 101 | 1 |
85 | 73 | 1 |
86 | 0 | 1 |
87 | 359 | 1 |
88 rows × 2 columns
Buggers - there's something wrong here: what journal has >350 pages, let alone for a single article?!?! Let's extract the information for this article.
maxdf = df[df['Number of Pages'] == df['Number of Pages'].max()]
maxdf
  | Unnamed: 0 | Publication Type | Authors | Author Full Names | Article Title | Source Title | Book Series Title | Book Series Subtitle | Language | Document Type | ... | Publication Year | Volume | Issue | Start Page | DOI | Number of Pages | WoS Categories | Web of Science Index | Research Areas | Country
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
12684 | 12684 | Journal | Narasimhan, VM; Patterson, N; Moorjani, P; Roh... | Narasimhan, Vagheesh M.; Patterson, Nick; Moor... | The formation of human populations in South an... | SCIENCE | NaN | NaN | English | Article | ... | 2019.0 | 365 | 6457 | 999 | 10.1126/science.aat7487 | 359 | Multidisciplinary Sciences | Science Citation Index Expanded (SCI-EXPANDED)... | Science & Technology - Other Topics | ['Afghanistan', 'Austria', 'Canada', 'China', ... |
1 rows × 41 columns
We can look deeper at this entry, but all we will find is more troubling issues.
So we use our good friend Google to find the article here:
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6822619/pdf/nihms-1053677.pdf
So most of the info is correct, just not the number of pages nor its end page (if we were to extract that info we would find +999).
This is just one advantage of using data visualization to explore and possibly clean up your data!
There's more to explore here, but let's move on.
The NLTK book, Natural Language Processing with Python, provides a practical introduction to programming for language processing. Written by the creators of NLTK (the Natural Language Toolkit), it guides the reader through the fundamentals of writing Python programs, working with corpora, categorizing text, analyzing linguistic structure, and more.
As per the official website: https://www.nltk.org/
We will use the toolkit to look for common phrases found in each item's Abstract.
We will look at what are known as ngrams: groupings of neighboring words. This will make sense as we go (see the small sketch below).
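To make the idea concrete before we touch the real data, here is a tiny sketch on a made-up sentence; nltk.ngrams is the same helper our build_gram() function will use later:
import nltk

sample = "climate change is an established scientific fact".split()
print(list(nltk.ngrams(sample, 2)))   # bigrams: pairs of neighboring words
# [('climate', 'change'), ('change', 'is'), ('is', 'an'), ...]
print(list(nltk.ngrams(sample, 3)))   # trigrams: triples of neighboring words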
The first thing we need to do is download some special data needed for nltk.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
[nltk_data] Error loading stopwords: <urlopen error [WinError 10054]
[nltk_data]     An existing connection was forcibly closed by the
[nltk_data]     remote host>
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tdunn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tdunn\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
True
We will also build an (empty, for now) list of additional 'stopwords'.
Stopwords are common words such as 'the', 'and', 'this', 'there', etc.
The set we will be using today is small (~180 words) but pretty much sufficient for our needs.
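If you are curious what is actually in that set, here is a quick peek (this assumes the nltk.download('stopwords') call above succeeded):
from nltk.corpus import stopwords

sw = stopwords.words('english')
print(len(sw))     # roughly 180 words
print(sw[:10])     # the first few entries, e.g. ['i', 'me', 'my', 'myself', ...]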
We will pretend this is a constant. However, Python has no concept of true constant variables the way, say, C, C++, or Fortran does (see the short aside after the next cell).
ADDITIONAL_STOPWORDS = ['']
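A small aside on that 'constant' point, using a hypothetical name (MAX_PAGES is not part of our pipeline): typing.Final lets a type checker such as mypy flag reassignment, but the runtime itself will not complain.
from typing import Final

MAX_PAGES: Final[int] = 500   # hypothetical constant, for illustration only
MAX_PAGES = 600               # runs fine at runtime; a type checker would flag this line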
We will now create a new function we will simply call 'basic_clean', and I'll walk you through it.
def basic_clean(text):
    """ basic_clean() - A simple function to clean up the data. All the words that
                        are not designated as stop words are lemmatized after
                        encoding and basic regex parsing are performed.
                        Uses 'NFKD' normalization (https://docs.python.org/3/library/unicodedata.html)
    Params:
        text (str) - Text to clean and lemmatize
    Returns:
        lemmed (list) - list of lemmatized words
    """
    # Create a lemmatizer to reduce each word to its base (lemma) form
    wnl = nltk.stem.WordNetLemmatizer()
    # Build the list of stop words to remove
    # Note the use of our ADDITIONAL_STOPWORDS.
    stopwords = nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS
    # Normalize the text using NFKD normalization (https://docs.python.org/3/library/unicodedata.html),
    # strip anything that will not encode to ASCII, and lowercase it all
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # Use regex to remove every character that is not a word character or whitespace ('[^\w\s]'),
    # then split the text into individual words
    words = re.sub(r'[^\w\s]', '', text).split()
    # Lemmatize every word that is not a stop word
    lemmed = [wnl.lemmatize(word) for word in words if word not in stopwords]
    return lemmed
You Try It: Load up our saved dataset into a fresh DataFrame.
Solution:
df = pd.read_csv(os.path.join('Processed_data', 'Combined_lists.csv'))
We will now call our function with our DataFrame.
We will also use what is known as a Jupyter magic.
In this case we will measure the amount of time it takes to run this function (~10 seconds for my laptop, and with only 1 of its 2 power supplies attached).
Note: Our function requires a single string of all the 'Abstract' entries 'joined' together!
%%time
words = basic_clean(''.join(str(df['Abstract'].tolist())))
Wall time: 19.2 s
Sooooo, the reason I timed this is that on the first go-around of writing this code I did things the 'documented' way and used the NLTK built-in 'tokenizer' to break the words down before lemmatizing them.
The problem with this approach is that it took >15 minutes to run.
Not ideal for our workshop!!!
Contrary to the normal Python route, I skipped using the library's function and used my own methodology.
Note: Usually Python code which is extremely process intensive is actually written in C (e.g. NumPy is pretty much all C!!!)
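For comparison, here is a rough sketch of the 'documented' tokenizer route (it assumes the 'punkt' tokenizer data has been downloaded); it handles punctuation and contractions more carefully, but it was far too slow on our multi-million-character string:
import nltk
# nltk.download('punkt')   # tokenizer models, if not already present

tokens = nltk.word_tokenize("Climate change is an established scientific fact.")
print(tokens)   # ['Climate', 'change', 'is', 'an', 'established', 'scientific', 'fact', '.']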
len(words)
1785509
words[0:15]
['climate', 'change', 'established', 'scientific', 'fact', 'dealing', 'may', 'require', 'significant', 'shift', 'consumption', 'economic', 'organization', 'key', 'question']
We will now concentrate on generating our most common ngrams.
def build_gram(words, depth, num):
    """ build_gram() - Builds ngram word groupings for the lemmatized text.
    Params:
        words (list) - cleaned, lemmatized text (text cleaned by running basic_clean() first)
        depth (int) - Depth of the ngram build out (e.g. 2 for bigrams, 3 for trigrams, etc.)
        num (int) - Number of top ngrams to return
    Returns:
        ngram (DataFrame)
    """
    # Create a DataFrame of words in ngram groupings (depth) with the top 'num' entries
    ngram = (pd.Series(nltk.ngrams(words, depth)).value_counts())[:num].to_frame()
    # Reset the DataFrame index, for cleanliness more than anything else
    ngram.reset_index(inplace=True)
    # Rename the header column names
    ngram = ngram.rename(columns={'index': 'ngram', 0: 'count'})
    # Cast all ngrams to strings for cleanliness when plotting
    ngram['ngram'] = ngram['ngram'].astype(str)
    # Sort the DataFrame based on counts
    ngram.sort_values('count', ascending=False, inplace=True)
    # Strip the leftover tuple punctuation from the ngram strings
    ngram['ngram'] = ngram['ngram'].map(lambda x: x.lstrip("('").rstrip("',)"))
    return ngram
Next we will build a function to use Plotly's Graph Objects (go) Bar charts.
The advantage of plotly.go is that it is VASTLY more customizable using those 'update_traces' and 'update_layout' functions we talked about earlier.
Again, I'll quickly walk you through the code.
def plot_ngram(ngram, title):
    # Create a figure we will add our chart to
    fig = go.Figure()
    # Add a bar chart to the figure
    fig.add_trace(go.Bar(x=ngram['ngram'],          # x-axis data
                         y=ngram['count'],          # y-axis data
                         text=ngram['count'],       # Textual data displayed above each bar
                         textposition='outside',    # Location of 'text'
                         orientation='v',           # We want vertical bars; 'h' would give us horizontal bars
                         )
                  )
    fig.update_layout(title=f'<b>{title} Analysis</b>',                  # Title of figure/chart
                      width=1400,                                         # Width of figure
                      height=800,                                         # Height of figure
                      xaxis_title="<b>ngrams</b>",                        # x-axis label
                      yaxis_title="<b>ngram count (in thousands k)</b>",  # y-axis label
                      font=dict(family="Arial",                           # dict specifying font details
                                size=16,
                                )
                      )
    fig.show()  # Display the figure
Now to run both our ngram and plotting functions.
ngram = build_gram(words, depth=1, num=20)
plot_ngram(ngram,title='Unigram')
You try it: Create a plot of bigrams and then trigrams.
Solution:
ngram = build_gram(words, depth=2, num=20)
plot_ngram(ngram,title='Bigram')
ngram = build_gram(words, depth=3, num=20)
plot_ngram(ngram,title='Trigram')
Well, we started to get good information that may be of value to us, except now we have a bunch of word associations we do not need.
Remember that ADDITIONAL_STOPWORDS list?
Let's make use of it and rerun everything.
ADDITIONAL_STOPWORDS = ['ltd', 'right', 'reserved', 'elsevier']
words = basic_clean(''.join(str(df['Abstract'].tolist())))
ngram = build_gram(words, depth=3, num=20)
plot_ngram(ngram,title='Trigram')
Obviously we could do a lot more work here, but I think you are beginning to see the power of this approach.
But let's go ahead and move on to our next exploration.
We will now quickly produce another type of scatter plot, this time using plotly.go.
For this we will explore the number of publications per year as well as their number of citations.
We naturally begin with getting a clean DataFrame.
df = pd.read_csv(os.path.join('Processed_data', 'Combined_lists.csv'))
Next we create a new DataFrame with just the 'Publication Year' and 'Cited Reference Count' columns.
Then we get rid of any rows that did not have a publication year in our new 'Pub_Year' column.
Finally we will confirm we got rid of all of them.
ndf = pd.DataFrame({'Pub_Year':df['Publication Year'], 'Cited_Count':df['Cited Reference Count']})
ndf.dropna(inplace=True)
ndf['Pub_Year'].isna().sum()
0
We will quickly look at 'Pub_Year' to make sure it's a decent enough data type for our needs (we want integer values).
ndf['Pub_Year']
0        2010.0
1        2014.0
2        2022.0
3        2015.0
4        2020.0
          ...
12681    2020.0
12682    2021.0
12683    2011.0
12684    2019.0
12685    2021.0
Name: Pub_Year, Length: 12527, dtype: float64
Since our column actually holds 'floats', we will 'cast' them to integers using astype('int32').
Following this we will look at all the unique values (publication years) we have in our data.
ndf['Pub_Year'] = ndf['Pub_Year'].astype('int32')
ndf['Pub_Year'].unique()
array([2010, 2014, 2022, 2015, 2020, 2018, 2017, 2019, 2009, 2012, 2006, 2011, 2021, 2013, 2008, 2016, 2007, 2002, 2005, 1995, 2001, 2004, 2023, 1997, 2000, 1999, 2003, 1996, 1956, 1998, 1978, 1991, 1990, 1979, 1994, 1982, 1969, 1963, 1992, 1988, 1993, 1973])
We should note that there is an issue here.
First, note that I generated the data we are using in Dec 2022.
Let's just assume all rows with a publication date of 2023 are pre-prints, and let's ignore those.
You Try it: Go ahead and remove all rows with 'Pub_Year' equal to 2023 and confirm it. Hint: instead of getting rid of 2023, keep everything that is not equal to 2023.
Solution:
#Remove preprints and errors for 2023 (data gathered in Dec 2022)
ndf = ndf[ndf['Pub_Year'] != 2023]
ndf['Pub_Year'].unique()
array([2010, 2014, 2022, 2015, 2020, 2018, 2017, 2019, 2009, 2012, 2006, 2011, 2021, 2013, 2008, 2016, 2007, 2002, 2005, 1995, 2001, 2004, 1997, 2000, 1999, 2003, 1996, 1956, 1998, 1978, 1991, 1990, 1979, 1994, 1982, 1969, 1963, 1992, 1988, 1993, 1973])
Now we will go through and create yet another DataFrame which includes the 'Pub_Year'.
Then we create a new column, 'Num_Published', which is the number/count of articles published each year.
ndf2 = ndf['Pub_Year'].value_counts().to_frame()   # Create a new DataFrame with the counts for each publication year (the years end up in the index)
ndf2.reset_index(inplace=True)                     # Make the index (which holds the years) into a regular column instead of the index
ndf2.rename({'Pub_Year':'Num_Published', 'index':'Pub_Year'}, axis=1, inplace=True) # Rename the column header names
ndf2.sort_values(by='Pub_Year', inplace=True) # Sort the data by publication year in ascending order
ndf2.reset_index(drop=True, inplace=True) # Clean up the index
ndf2 # Display results
  | Pub_Year | Num_Published
---|---|---
0 | 1956 | 1 |
1 | 1963 | 1 |
2 | 1969 | 1 |
3 | 1973 | 1 |
4 | 1978 | 1 |
5 | 1979 | 1 |
6 | 1982 | 1 |
7 | 1988 | 1 |
8 | 1990 | 8 |
9 | 1991 | 11 |
10 | 1992 | 8 |
11 | 1993 | 10 |
12 | 1994 | 20 |
13 | 1995 | 24 |
14 | 1996 | 29 |
15 | 1997 | 40 |
16 | 1998 | 42 |
17 | 1999 | 47 |
18 | 2000 | 61 |
19 | 2001 | 47 |
20 | 2002 | 52 |
21 | 2003 | 57 |
22 | 2004 | 73 |
23 | 2005 | 159 |
24 | 2006 | 185 |
25 | 2007 | 234 |
26 | 2008 | 250 |
27 | 2009 | 245 |
28 | 2010 | 311 |
29 | 2011 | 385 |
30 | 2012 | 436 |
31 | 2013 | 510 |
32 | 2014 | 520 |
33 | 2015 | 593 |
34 | 2016 | 767 |
35 | 2017 | 910 |
36 | 2018 | 996 |
37 | 2019 | 1150 |
38 | 2020 | 1370 |
39 | 2021 | 1453 |
40 | 2022 | 1498 |
Next we want to get the number of citations per year as another DataFrame.
Then we store the results as a new column in our main DataFrame (ndf2).
cit_df = ndf.groupby(['Pub_Year']).sum()
ndf2['Cited_Count'] = cit_df['Cited_Count'].to_list()
ndf2.tail()
  | Pub_Year | Num_Published | Cited_Count
---|---|---|---
36 | 2018 | 996 | 70067 |
37 | 2019 | 1150 | 87021 |
38 | 2020 | 1370 | 101347 |
39 | 2021 | 1453 | 120629 |
40 | 2022 | 1498 | 125599 |
We can finally look at what our data looks like.
Our goal is to create a three (3) dimensional plot. NOT a 3D plot! I'll explain this when we look at the plot itself.
You can actually easily display 6, 7, 8, and, if careful, more than 8 dimensions of data using size, color, position, etc. (a quick sketch follows below).
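As a hedged illustration of packing several data dimensions into one flat plot, here is a sketch using the gapminder sample data that ships with Plotly (not our WoS data):
import plotly.express as px

gap = px.data.gapminder().query("year == 2007")
fig = px.scatter(gap,
                 x="gdpPercap", y="lifeExp",   # dimensions 1 and 2: position
                 color="continent",            # dimension 3: color
                 size="pop",                   # dimension 4: marker size
                 hover_name="country")         # dimension 5: revealed on hover
fig.show()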
We will create a plotly.go.Scatter plot somewhat like we previously made.
In addition we will customize the size, colors and even the hover text for our data.
fig = go.Figure()                                                 # Create a new plotly.go figure
fig.add_trace(go.Scatter(x=ndf2['Pub_Year'],                      # X-axis = publication year
                         y=ndf2['Num_Published'],                 # Y-axis = number of items published each year
                         name='Cited',                            # A special name which has numerous uses which we will explain when we look at the plot
                         mode='markers',                          # The kind of scatter plot, just markers; we could add lines but we don't want any
                         marker=dict(color=ndf2['Cited_Count'],   # Describe what the markers look like, set their color based on citation count
                                     size=(np.log(ndf2['Cited_Count']+.01)**2),  # Set the size of the markers on a log scale
                                     showscale=True,              # Display the color scale (transfer function colorbar)
                                     colorbar=dict(title='<b>Number of Citations per Year</b>'),  # Set the title for the colorscale
                                     opacity=0.75,                # Reduce the opacity of the markers
                                     ),
                         hovertemplate="<b>Year:</b> %{x}" +                   # Display the publication year for the moused-over/hover marker
                                       "<br><b># Publications:</b> %{y}" +     # Display the number of publications/year for the hover marker
                                       "<br><b># Citations made:</b> %{text}", # Display the citations/year for the hover marker - note we need the text
                         text=ndf2['Cited_Count']                 # Set the 'text' used for the line above
                         )
              )
fig.update_layout(title="<b>Web of Science: 'Climate and Art'<br>Number of Publications and Citations per Year</b>", # Set the title for the figure/plot
xaxis_title='<b>Year</b>', # Set the X-axis label title
yaxis_title='<b># Publications</b>', # Set the Y-axis label title
width=1200, # Set the figure width
height=800, # Set the figure height
)
fig.update_layout(hoverlabel=dict(bgcolor="white", # Set the marker hover over panel to background color to white
font_size=16, # Set hover over font size
font_family="Arial", # Set hover over font family
)
)
fig.show() # Display the plot
Remember all that work we did to create the country location for each author?
Let's not let all that work go in vain; let's create a nice plot of it.
One of the greatest benefits of using Plotly over Matplotlib, or even 'most' other packages, is the ease of creating geographic plots.
If you have ever dealt with geographic plotting in other packages you know all the work needed to deal with 'projections' of the locations (latitude/longitude).
But Plotly does all this in the background for you!!!
A choropleth plot is a geographic plot based on some regional definition and statistics for each segment of those regions (for us, countries).
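Before building ours with graph objects, here is a minimal Plotly Express sketch of the idea, using the gapminder sample data that ships with Plotly (its 'iso_alpha' column already holds 3-letter country codes):
import plotly.express as px

gap = px.data.gapminder().query("year == 2007")
fig = px.choropleth(gap,
                    locations="iso_alpha",           # 3-letter country codes define the regions
                    color="lifeExp",                 # the statistic mapped to color
                    hover_name="country",
                    color_continuous_scale="Turbo")
fig.show()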
df = pd.read_csv(os.path.join('Processed_data', 'Combined_lists.csv'))
We need to flatten out our df['Country'] column, which consists of a list of strings (country names) in each row.
NOTE: Saving our data out as a .csv means we saved our data in a textual format!
This is a MAJOR problem for things like our df['Country'] column, because the list in each row becomes a string representation of a list (NOT a real list data type!!!)
There are two ways around this issue: rebuild the lists while reading the file, or rebuild them afterwards with ast.literal_eval.
We will go through the process using the latter method (a sketch of the former follows below).
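For completeness, here is a hedged sketch of the read-time route (df_alt is just an illustrative name); it assumes every 'Country' entry parses as a Python literal, and note that the string 'None' would become the value None here instead of being skipped, so downstream code would need to handle that:
from ast import literal_eval
import os
import pandas as pd

df_alt = pd.read_csv(os.path.join('Processed_data', 'Combined_lists.csv'),
                     converters={'Country': literal_eval})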
temp = []                                             # Create a temporary empty list
for i in range(df.shape[0]):                          # Iterate through each row (yes, this is different from iterrows but does much the same thing)
    if df['Country'][i] == 'None':                    # If the row contains the 'None' we added earlier, skip it and continue with the for-loop
        continue
    else:
        temp.append(literal_eval(df['Country'][i]))   # Use ast.literal_eval to turn the stringified list back into a real list, then append it to our temporary list
author_countries = [item for sublist in temp for item in sublist]  # Iterate through the temporary list of lists and flatten it into a single list of countries
Next we will create our final DataFrame of the data we want, which will include the count of authors per country and the 3-letter country code (ISO alpha-3) for each country, stored in a column we will call 'FIPS'.
u_countries = Counter(author_countries).keys()             # Create a 'Counter' for each country and get the dictionary 'keys' from the Counter
u_count = Counter(author_countries).values()               # Create a 'Counter' for each country and get the dictionary 'values', which is the count of authors per country
df_countries = pd.DataFrame({'Country':u_countries, 'Counts':u_count})  # Create a new DataFrame with this data
temp = []                                                  # Create a temporary list to store the country codes in
for i in df_countries['Country']:                          # Iterate through each row/country
    temp.append(pycountry.countries.get(name=i).alpha_3)   # Use pycountry to find the 3-letter 'alpha_3' country code and append it to our temporary list
df_countries['FIPS'] = temp                                # Add this information to our DataFrame as a new 'FIPS' column
df_countries
  | Country | Counts | FIPS
---|---|---|---
0 | United Kingdom | 2226 | GBR |
1 | Argentina | 84 | ARG |
2 | Canada | 813 | CAN |
3 | Netherlands | 820 | NLD |
4 | Spain | 824 | ESP |
... | ... | ... | ... |
154 | Seychelles | 2 | SYC |
155 | Uzbekistan | 2 | UZB |
156 | Guyana | 1 | GUY |
157 | Brunei Darussalam | 2 | BRN |
158 | Yemen | 2 | YEM |
159 rows × 3 columns
And we finally get to making our last plot - the Choropleth of number of authors per country.
fig = go.Figure()                                      # Create a new plotly.go figure
fig.add_trace(go.Choropleth(                           # Add a Choropleth plot to the figure
              locations = df_countries['FIPS'],        # Use our 3-letter country codes (the 'FIPS' column) to define the regions of interest (ROI) for the choropleth
              z = df_countries['Counts'],              # Use our author counts/country as the data source for each coded country
              colorscale = 'Turbo',                    # Set the transfer function color map to 'Turbo'
              marker_line_color='darkgray',            # Set the country border color to dark gray
              marker_line_width=0.5,                   # Set the country border line width
              )
             )
scale = 1.5                                            # Create a scale factor for sizing the figure
fig.update_layout(
    width = 1024*scale,                                # Set the figure width
    height=728*scale,                                  # Set the figure height
    geo=dict(                                          # Create a dictionary describing how to customize the choropleth
        showframe=False,                               # Do not show a frame around the plot
        showcoastlines=True,                           # Show coastline borders
        projection_type='equirectangular'              # Set the geo 'projection' we want to display our map in
    ),
)
fig.show()                                             # Display the figure
You Try It: Modify the choropleth plot to add the following: a colorbar title, hover text showing the country name and number of authors, and a figure title.
Solution:
fig = go.Figure()                                              # Create a new plotly.go figure
fig.add_trace(go.Choropleth(                                   # Add a Choropleth plot to the figure
              locations = df_countries['FIPS'],                # Use our 3-letter country codes (the 'FIPS' column) to define the regions of interest (ROI) for the choropleth
              z = df_countries['Counts'],                      # Use our author counts/country as the data source for each coded country
              colorscale = 'Turbo',                            # Set the transfer function color map to 'Turbo'
              marker_line_color='darkgray',                    # Set the country border color to dark gray
              marker_line_width=0.5,                           # Set the country border line width
              colorbar_title = '<b>Authors per Country</b>',   # Add a title to our colorbar
              text = df_countries['Country'],                  # The textual name for each coded region (which is the country name in our case)
              hovertemplate="<b>Country: </b> %{text}" +       # Display the country name for the moused-over/hover region
                            "<br><b># Authors:</b> %{z}",      # Display the number of authors/country for the moused-over/hover region
              )
             )
scale = 1.5                                                    # Create a scale factor for sizing the figure
fig.update_layout(
    title_text='<b>Number of Authors per Country</b>',         # Add a title to our figure/plot
    width = 1024*scale,                                         # Set the figure width
    height=728*scale,                                           # Set the figure height
    geo=dict(                                                   # Create a dictionary describing how to customize the choropleth
        showframe=False,                                        # Do not show a frame around the plot
        showcoastlines=True,                                    # Show coastline borders
        projection_type='equirectangular'                       # Set the geo 'projection' we want to display our map in
    ),
)
fig.show()                                                      # Display the figure
I normally teach a couple different multi-week series on data visualization for CRDDS.
I'm not sure if I will be teaching those, but I will be teaching something graphical in nature for Love Data Week in February.
So please stay tuned to Kim's weekly CRDDS Newsletters with updates.
If there is something that you would dearly love to see in a workshop PLEASE reach out to us to let us know.
No promises but we LOVE to help!!!