## Program to assist librarians with weeding by making predictions based on past decision data and integrating librarian-approved predictions into the data set

import requests, xmltodict, json, csv, re, datetime, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# from email_test import *
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from dateutil.parser import parse
from sklearn.preprocessing import StandardScaler
from config1 import *  ## expected to provide dept_wrapper, ref_dict, H1, and H2 used by split_collection
## CUDA/tensorflow reqs
# import cupy as xp
# import sklearn.model_selection
# from sklearn.datasets import load_digits
# from svm import SVM
def construct_ID(step_name, collection_code):
## takes a step name and a collection code and constructs the file name used for the attached sent file, the file downloaded from the email response, and the file loaded after saving
today = datetime.datetime.now().date()
# today = '2020-08-18'
# if t_o == True:
# today = ''
return ('%s_%s_%s.csv' % (today, collection_code, step_name))
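## quick sketch of the naming convention this produces (the date is whatever day the script runs):
# construct_ID('predictions', 'GN_ANTH')
# -> '2020-08-18_GN_ANTH_predictions.csv'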
def write_csv_from_list(cID, wrapper_list):
## takes ID and csv wrapper to write file with ID name
## need to use different paths, can add in folder structure to cID field
## newline='' keeps the csv module from writing blank rows between records on Windows
with open('./%s' % cID, 'w', newline='', encoding='utf-8') as f:
csv.writer(f).writerows(wrapper_list)
def read_csv_to_list_wrapper(path):
wrapper = []
with open(path, 'r', encoding='utf-8') as f:
csv_reader = csv.reader(f)
for row in csv_reader:
if row == []:
## skip blank rows (typically left behind when a csv was written without newline='')
pass
else:
wrapper.append(row)
return wrapper
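## sketch of the write/read round trip these helpers support; the call number and values are illustrative
# cID = construct_ID('X_train', 'H_PS')
# write_csv_from_list(cID, [['Permanent Call Number', 'Withdrawn'], ['HQ767 .S5', '0']])
# wrapper = read_csv_to_list_wrapper('./%s' % cID)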
def add_ishare_to_wrapper(alma_wrapper):
call_heading = 'Permanent Call Number'
alma_wrapper[0].append('# in IShare')
## this isn't actually as bad as it looks
for index, column_heading in enumerate(alma_wrapper[0]):
if (column_heading == call_heading):
## NOTE: only the first four data rows are processed here; use alma_wrapper[1:] for the full set
for row in alma_wrapper[1:5]:
## throttle Primo API requests to roughly one every 1.5 seconds
time.sleep(1.5)
primo_results = primo_api(row[index]).json()
row.append(how_many_ishare(primo_results))
# print('*******************************************')
# print(row[index])
# # print(is_available_ishare(row[index]))
# print(how_many_ishare(primo_results))
# break
break
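## sketch: add_ishare_to_wrapper modifies the wrapper in place, appending the '# in IShare' heading
## and a holdings count to each processed row (currently only the first few data rows, per the slice above)
# add_ishare_to_wrapper(csv_wrapper)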
def how_many_ishare(primo_results):
for c in primo_results['facets']:
if (c['name'] == 'institution'):
return len(c['values'])
def is_available_ishare(call_number):
## returns 1 if available in IShare, 0 if not
primo_results = primo_api(call_number).json()
if (not primo_results['docs'][0]['delivery']['almaInstitutionsList']):
return 0
else:
# print(inst_code(primo_results))
# print(availability_status(primo_results))
return 1
def inst_code(primo_results):
return primo_results['docs'][0]['delivery']['almaInstitutionsList'][0]['instCode']
def availability_status(primo_results):
return primo_results['docs'][0]['delivery']['almaInstitutionsList'][0]['availabilityStatus']
def primo_api(query):
## takes primo query term and searches all ishare institutions, returns response. seems to work well using call number field
api_key = '<api_key>'  ## redacted; presumably the full '&apikey=...' query fragment appended to the URL
url = 'https://api-na.hosted.exlibrisgroup.com/primo/v1/pnxs?vid=01CARLI_UIS:CARLI_UIS&tab=NewDiscoveryNetwork&scope=NewDiscoveryNetwork&q=any,contains,'
assembled = url + query + api_key
response = requests.get(assembled)
return response
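## sketch of how the Primo helpers chain together; the call number is illustrative
# primo_results = primo_api('QA76.73 .P98').json()
# print(how_many_ishare(primo_results))      ## number of I-Share institutions in the 'institution' facet
# print(is_available_ishare('QA76.73 .P98')) ## 1 if an Alma institution list is present, else 0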
def read_api_to_wrapper():
## pulls report from alma api, cleans the publication dates, adds column based on general/withdrawn collection status
## returns csv in a list of lists
## report details are determined within Alma, currently reporting on the whole general collection
now = datetime.datetime.now()
api_key = '<api_key>'  ## redacted; presumably the full '&apikey=...' query fragment appended to the URL
url = 'https://api-na.hosted.exlibrisgroup.com/almaws/v1/analytics/reports'
report_path = '?path=%2Fshared%2FUniversity of Illinois at Springfield (UIS) —Springfield%2C IL 01CARLI_UIS%2FReports%2Ftest_1'
limiters = '&limit=1000&col_names=true'
token = '&token='
assembled = url + report_path + limiters + api_key
response = requests.get(assembled)
results = xmltodict.parse(response.text)
is_finished = results['report']['QueryResult']['IsFinished']
resumption_token = ''
elements = results['report']['QueryResult']['ResultXml']['rowset']['xsd:schema']['xsd:complexType']['xsd:sequence']['xsd:element']
headings = []
col_count = 0
numbered_columns = []
for element in elements:
column_num = 'Column' + str(col_count)
numbered_columns.append(column_num)
col_count = col_count + 1
headings.append(element['@saw-sql:columnHeading'])
result_list = [headings]
rows = results['report']['QueryResult']['ResultXml']['rowset']['Row']
temp = []
## this loop is probably too slow
for row in rows:
row_list = []
for item in numbered_columns:
# print(row.get(item) * None)
# print(row[item])
# row_list.append((row.get(item) != None) * (row.get(item)) + ((row.get(item) == None) * ('')))
if row.get(item) is not None:
row_list.append(row[item])
else:
row_list.append('')
result_list.append(row_list)
temp.append(len(row_list))
## continue query if more pages exist start
if is_finished == 'false':
resumption_token = results['report']['QueryResult']['ResumptionToken']
while is_finished == 'false':
results = xmltodict.parse(requests.get(assembled + token + resumption_token).text)
if results.get('report') is not None:
## refresh the finished flag from each chunk so the loop can terminate normally
is_finished = results['report']['QueryResult'].get('IsFinished', 'false')
rows = results['report']['QueryResult']['ResultXml']['rowset']['Row']
else:
# print('no report')
# print(results)
break
## this loop is also probably too slow
for row in rows:
row_list = []
for item in numbered_columns:
# row_list.append(((row.get(item) != None) * (row[item])) + ((row.get(item) == None) * ('')))
if row.get(item) is not None:
row_list.append(row[item])
else:
row_list.append('')
result_list.append(row_list)
temp.append(len(row_list))
## continue query end
print(result_list[0])
return data_prep(result_list)
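## read_api_to_wrapper hands back the same three-tuple as data_prep; 'full_report' and 'ALL' are illustrative names
# csv_wrapper, duplicates, weird_dates = read_api_to_wrapper()
# write_csv_from_list(construct_ID('full_report', 'ALL'), csv_wrapper)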
def translate_csv_reader(wrapper_list, column_index):
## translates a csv list wrapper from human-readable 'yes'/'no' to '1'/'0'. takes a column index and fills blanks in that column with 'yes'/'1'
## can this optionally take more than one index rather than running multiple times in a row?
header = wrapper_list[0]
translated = []
for row in wrapper_list:
if row[column_index] != header[column_index]:
if row[column_index] == '':
row[column_index] = '1'
## exact matches only, so cells that merely contain 'no' or 'yes' as a substring are left untouched
row = ['0' if item == 'no' else item for item in row]
row = ['1' if item == 'yes' else item for item in row]
translated.append(row)
return translated
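## small sketch of the translation, assuming the header row is carried through; the wrapper and column index are illustrative
# wrapper = [['Permanent Call Number', 'Do you agree?'], ['HQ767 .S5', 'yes'], ['QA76.73 .P98', '']]
# translate_csv_reader(wrapper, 1)
# -> [['Permanent Call Number', 'Do you agree?'], ['HQ767 .S5', '1'], ['QA76.73 .P98', '1']]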
def run_algorithm(collection_code, test_input, training_input):
## runs algorithm against provided list wrappers
## currently using sklearn SVC with an RBF kernel
## features are standardized manually below (normalized_X_train / normalized_X_test)
test_data = pd.DataFrame(test_input[1:], columns = test_input[0])
training_data = pd.DataFrame(training_input[1:], columns = training_input[0])
scaler = StandardScaler()  ## currently unused; standardization is done manually below
drop = ['0', 'Creation Date', 'Material Type', 'Location Name', 'Internal Note 1', 'Internal Note 2', 'Internal Note 3', 'Lifecycle', 'Physical Condition', 'Receiving Date And Time', 'Last Loan Date', 'Last Loan Date (In House)', 'Last Loan Date (Not In House)', 'Publication Date']
drop2 = ['Permanent Call Number', 'Withdrawn']
X = training_data
y = training_data['Withdrawn']
X2 = test_data
y2 = test_data['Withdrawn']
for column in drop:
X = X.drop(column, axis=1)
X2 = X2.drop(column, axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
# X_train = X
# y_train = y
# X_test = X2
# y_test = y2
test_data = X_test.copy(deep=True)
for column in drop2:
X_test = X_test.drop(column, axis=1)
X_train = X_train.drop(column, axis=1)
X_test['Num of Loans - not sum'] = X_test['Num of Loans - not sum'].astype(float)
X_test['Years Since Publication'] = X_test['Years Since Publication'].astype(float)
X_test['Current Shelf Time'] = X_test['Current Shelf Time'].astype(float)
X_train['Num of Loans - not sum'] = X_train['Num of Loans - not sum'].astype(float)
X_train['Years Since Publication'] = X_train['Years Since Publication'].astype(float)
X_train['Current Shelf Time'] = X_train['Current Shelf Time'].astype(float)
## note: each set is standardized with its own mean/std; reusing the training statistics for the test set is the more common convention
normalized_X_test = (X_test - X_test.mean())/X_test.std()
normalized_X_train = (X_train - X_train.mean())/X_train.std()
normalized_X_train.to_csv('./data/training_data/%s' % construct_ID('X_train', collection_code), encoding='utf-8', index=False)
normalized_X_test.to_csv('./data/training_data/%s' % construct_ID('X_test', collection_code), encoding='utf-8', index=False)
svclassifier = SVC(kernel='rbf', gamma=80)
svclassifier.fit(normalized_X_train, y_train)
y_pred = svclassifier.predict(normalized_X_test)
## translate start
s = {'0':'no', '1':'yes'} ## translate to human string
i = {0:'no', 1:'yes'} ## translate to human int
test_data['Remove from collection'] = y_pred
test_data['Remove from collection'] = test_data['Remove from collection'].replace(s)
# test_data = test_data.drop('Withdrawn', axis=1)
test_data['Do you agree?'] = [''] * len(test_data['Remove from collection'])
## translate end
test_data.to_csv('./data/predictions/%s' % construct_ID('predictions', collection_code), encoding='utf-8', index=False)
## end sklearn algorithm
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
## tensorflow implementation start
# y_test2 = y_test
# x_train = xp.asarray(X_train)
# x_test = xp.asarray(X_test)
# y_train = xp.asarray(y_train)
# y_test = xp.asarray(y_test)
# svm = SVM(kernel='linear', kernel_params={'sigma': 15}, classification_strategy='ovr', x=x_train, y=y_train, n_folds=9, use_optimal_lambda=True, display_plots=True)
# # print(x_test)
# # print('***************')
# # print(y_test)
# svm.fit(x=x_test, y=y_test)
# # print(x_test)
# y_pred = svm.predict(x_test)
# print(y_pred)
# misclassification_error = svm.compute_misclassification_error(x_test, y_test)
# print('Misclassification error, lambda = {} : {}\n'.format(svm._lambduh, misclassification_error))
## tensorflow implementation end
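## sketch of a call to run_algorithm; the file name and translated column index are illustrative,
## and note the current code splits its own train/test set from training_input (test_input is read but otherwise unused)
# training_wrapper = read_csv_to_list_wrapper('./data/training_data/2020-08-18_GN_ANTH_training.csv')
# training_wrapper = translate_csv_reader(training_wrapper, 0)
# run_algorithm('GN_ANTH', training_wrapper, training_wrapper)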
def split_collection(csv_wrapper):
## splits the collection into a dictionary of lists where the key is the collection title/call prefix and the element is the collection of items
## rudimentary call number column detection, simply looks for 'call' in the heading (first row of csv)
head = csv_wrapper[0]
call_index = 0
for heading in head:
if 'call' in heading.lower():
break
else:
call_index += 1
## reading and sorting by call prefix
## G might need special handling
for a in dept_wrapper:
dept_wrapper[a].append(head)
## skip the header row, which was already appended to each department list above
for row in csv_wrapper[1:]:
prefix0 = re.findall('^[a-zA-Z]{1,2}', row[call_index])
## guard the indexing so an empty match falls through to the error branch instead of raising an IndexError
prefix1 = prefix0[0] if prefix0 else ''
first_letter = prefix1[0] if prefix1 else ''
if prefix0 == []:
print(row[call_index])
print('ERROR')
elif re.match('^GN', row[call_index]):
dept_wrapper['GN_ANTH'].append(row)
elif prefix1 in H1:
dept_wrapper['H_PS'].append(row)
elif prefix1 in H2:
dept_wrapper['H_SL'].append(row)
elif first_letter in ref_dict:
dept_wrapper[ref_dict[first_letter]].append(row)
else:
## catches HH, I, and Y. apparently we don't have those collections or something
## looks like anything that gets a yikes is in WITHDRAWN
print(row[call_index])
print('yikes')
return dept_wrapper
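## split_collection leans on dept_wrapper, ref_dict, H1, and H2 from config1; a minimal sketch of the
## assumed shapes (illustrative stand-ins, not the real config1 contents):
# dept_wrapper = {'GN_ANTH': [], 'H_PS': [], 'H_SL': []}   ## department key -> list of rows
# ref_dict = {'Q': 'H_PS'}                                 ## first call letter -> department key
# H1 = ['HB', 'HC']                                        ## H prefixes routed to H_PS
# H2 = ['HM', 'HQ']                                        ## H prefixes routed to H_SL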
def data_prep(result_list):
## clean publication dates and add withdrawn data column. returns dupe list if needed for comparison
pubdate_index, call_no, last_loan, last_loan_in_index, last_loan_notin_index, location, rec_date = get_indices(result_list)
now = datetime.datetime.now()
csv_wrapper = []
year = now.year
unique_call = []
duplicates = []
weird_dates = []
## add calculated columns
## this calculation gives the data a perspective of when it was ran, as these values change over time
result_list[0].append('Years Since Publication')
result_list[0].append('Withdrawn')
result_list[0].append('Current Shelf Time')
csv_wrapper.append(result_list[0])
duplicates.append(result_list[0])
weird_dates.append(result_list[0])
for index, row_list in enumerate(result_list[1:]):
## clean pub date
# print(row_list[pubdate_index])
# temp = (((row_list[pubdate_index] == '') * '0000') + ((row_list[pubdate_index] == '') * weird_dates.append(row_list)) + ((row_list[pubdate_index] != '') * row_list[pubdate_index]))
# print(weird_dates)
temp = row_list[pubdate_index]
if temp == '':
## rows with no publication date go to weird_dates and are skipped
weird_dates.append(row_list)
continue
# print(parse(temp).year)
regex = re.compile('[^0-9]')
cleaned = regex.sub('', temp)
regex_pubdate_string = cleaned
current_year = year
## split string into first 4 chars and last n chars (whatever is left)
first_regex_pubdate_string = regex_pubdate_string[0:4]
second_regex_pubdate_string = regex_pubdate_string[4:]
if len(second_regex_pubdate_string) < 4:
second_regex_pubdate_string = '0000'
if first_regex_pubdate_string == '':
## no usable digits in the publication date; route the row to weird_dates and skip it
weird_dates.append(row_list)
continue
## data is passed into date_algo, which keeps the larger of the two candidate years in the row and returns the difference from the current year
# print(date_algo(row_list, first_regex_pubdate_string, second_regex_pubdate_string, current_year, pubdate_index))
row_list.append(date_algo(row_list, first_regex_pubdate_string, second_regex_pubdate_string, current_year, pubdate_index))
# if len(cleaned) == 4:
# row_list[pubdate_index] = int(cleaned)
# diff = year - row_list[pubdate_index]
# row_list.append(diff)
# el
# if len(cleaned) == 6:
# row_list[pubdate_index] = int(cleaned[0:4])
# diff = year - row_list[pubdate_index]
# row_list.append(diff)
# el
# if len(cleaned) == 8:
# date1 = int(cleaned[0:4])
# date2 = int(cleaned[4:8])
# if date1 > date2:
# row_list[pubdate_index] = date1
# diff = year - date1
# row_list.append(diff)
# elif date2 > date1:
# row_list[pubdate_index] = date2
# diff = year - date2
# row_list.append(diff)
# else:
# print('caught: ' + str(row_list[pubdate_index]))
# weird_dates.append(row_list)
# continue
## withdrawn value (assumes Location Name is either 'General' or 'WITHDRAWN'; any other value would skip this append and shift the later calculated columns)
if row_list[location] == 'General':
row_list.append(0)
elif row_list[location] == 'WITHDRAWN':
row_list.append(1)
if row_list[rec_date] == '':
weird_dates.append(row_list)
continue
## shelf time
if row_list[last_loan] =='':
date = datetime.datetime.strptime(row_list[rec_date], '%Y-%m-%dT%H:%M:%S')
else:
date = datetime.datetime.strptime(row_list[last_loan], '%Y-%m-%dT%H:%M:%S')
row_list.append((now - date).days)
## check for duplicates based on perm call no
if row_list[call_no] in unique_call:
duplicates.append(row_list)
continue
else:
unique_call.append(row_list[call_no])
csv_wrapper.append(row_list)
return csv_wrapper, duplicates, weird_dates
def date_algo(row_list, first_regex_pubdate_string, second_regex_pubdate_string, current_year, pubdate_index):
## compares the two regex-cleaned year strings, writes the larger one back to the row at the correct index, and returns the difference from the current year so the data has a static frame of reference from when it runs
## only one helper is called per row, so the publication date column is only overwritten once
if int(first_regex_pubdate_string) >= int(second_regex_pubdate_string):
return date1(row_list, first_regex_pubdate_string, current_year, pubdate_index)
return date2(row_list, second_regex_pubdate_string, current_year, pubdate_index)
def date1(row_list, first_regex_pubdate_string, current_year, pubdate_index):
row_list[pubdate_index] = int(first_regex_pubdate_string)
static_reference_frame_date_difference = current_year - row_list[pubdate_index]
return static_reference_frame_date_difference
def date2(row_list, second_regex_pubdate_string, current_year, pubdate_index):
row_list[pubdate_index] = int(second_regex_pubdate_string)
static_reference_frame_date_difference = current_year - row_list[pubdate_index]
return static_reference_frame_date_difference
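## worked example of the publication-date cleanup feeding these helpers (values illustrative):
## '[1998], c1996' -> digits only '19981996' -> first four chars '1998', remainder '1996'
# date_algo(row_list, '1998', '1996', 2020, pubdate_index)
# -> 22, with row_list[pubdate_index] set to 1998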
def get_indices(result_list):
## pubdate_index, call_no_index, last_loan_index, last_loan_in_index, last_loan_notin_index, location_index, rec_date_index
start = time.time()
print(time.time())
# last_loan_in_index = result_list[0].index('Last Loan Date (in House)')
# last_loan_notin_index = result_list[0].index('Last Loan Date (not In House)')
# last_loan_index = result_list[0].index('Last Loan Date')
# location_index = result_list[0].index('Location Name')
# call_no_index = result_list[0].index('Permanent Call Number')
# pubdate_index = result_list[0].index('Publication Date')
# rec_date_index = result_list[0].index('Receiving Date And Time')
for index, heading in enumerate(result_list[0]):
if heading.lower() == 'last loan date (in house)':
last_loan_in_index = index
elif heading.lower() == 'last loan date (not in house)':
last_loan_notin_index = index
elif heading.lower() == 'last loan date':
last_loan_index = index
elif heading.lower() == 'location name':
location_index = index
elif heading.lower() == 'permanent call number':
call_no_index = index
elif heading.lower() == 'publication date':
pubdate_index = index
elif heading.lower() == 'receiving date and time':
rec_date_index = index
end = time.time()
print(end)
print(start)
print(end-start)
return pubdate_index, call_no_index, last_loan_index, last_loan_in_index, last_loan_notin_index, location_index, rec_date_index
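## rough end-to-end sketch of how the pieces above chain together; the department code and column index are
## illustrative, the ordering is a guess, and the real workflow also emails files out for librarian review
# csv_wrapper, duplicates, weird_dates = read_api_to_wrapper()
# dept_wrapper = split_collection(csv_wrapper)
# training = translate_csv_reader(dept_wrapper['GN_ANTH'], 0)
# run_algorithm('GN_ANTH', training, training)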