Program to assist librarians with weeding by making predictions based on past decision data and integrating librarian-approved predictions into the data set

import email, smtplib, os.path
import re  # used by read_file_name below
from exchangelib import Credentials, Account, DELEGATE, Configuration, FileAttachment, ItemAttachment, Message, CalendarItem, HTMLBody
from cont_modules import *
from creds import creds
from config1 import acc, config


def sendmail(attachment_filename, email_target):
    ## takes a file name, attaches that file from the local folder to a new email, and sends it to email_target
    body = 'test email'
    item = Message(account=acc, subject='Library weeding', body=body, to_recipients=[email_target])
    with open('./' + attachment_filename, 'rb') as f:
        binary_file_content = f.read()
    attachment = FileAttachment(name=attachment_filename, content=binary_file_content)
    item.attach(attachment)
    item.send()


def checkmail(collection_code):
    ## takes a collection code, finds the first unread message whose first attachment is the
    ## predictions file for that collection, and downloads it to the local folder
    ## currently handles a single email and its first attachment only; the message must be marked as unread
    for index, item in enumerate(acc.inbox.all().iterator()):
        if not item.is_read and len(item.attachments) >= 1:
            if item.attachments[0].name == construct_ID('predictions', collection_code):
                with open('./%s' % construct_ID('librarian_decisions', collection_code), 'wb') as f:
                    f.write(item.attachments[0].content)
                item.is_read = True
                item.save()
                break
        if index >= 64:
            ## only check the first 64 messages
            break
    return construct_ID('librarian_decisions', collection_code)


def check_if_new(item):
    return not item.is_read and len(item.attachments) >= 1


def get_attachment(attachment):
    ## parses the collection code out of the attachment file name and, for prediction files,
    ## returns the attachment content along with that code
    date, collection_code, step = read_file_name(attachment.name)
    if 'predictions.csv' in attachment.name:
        return attachment.content, collection_code


def read_file_name(string_file_name):
    ## splits a file name of the form '<date>_<collection code>_<step>.csv' (date is the first
    ## 10 characters) into its date, collection code, and step name
    date = string_file_name[0:10]
    collection = re.match('^[A-Z]{1,2}_[-A-Z]{2,12}', string_file_name[11:])[0]
    step_start = len(date) + len(collection) + 2
    step_end = len(string_file_name) - 4
    step = string_file_name[step_start:step_end]
    return date, collection, step


def loop_inbox():
    ## checks new inbox messages, saves any returned predictions file, and loads it for processing
    ## needs functionality to check for duplicates
    for index, item in enumerate(acc.inbox.all().iterator()):
        if check_if_new(item):
            ## only grabbing the first attachment for now; skip messages that are not a predictions file
            result = get_attachment(item.attachments[0])
            if result is not None:
                content, collection = result
                write_attachment(content, collection)
                item.is_read = True
                item.save()
                attachment = read_csv_to_list_wrapper('./data/decisions/%s' % construct_ID('librarian_decisions', collection))
                pred_index, dec_index = get_decision_and_prediction_index(attachment)
                print(pred_index)
                # translated = translate_csv_reader(attachment, pred_index)
                # translated = translate_csv_reader(translated, dec_index)
                # decision_list = compare_decisions(translated)
                # integrate_into_training_set(decision_list)
                # write_csv_from_list('data/decisions/%s' % construct_ID('librarian_final', collection), decision_list)
                # translated_list = translate_final(decision_list)
                # write_csv_from_list('data/decisions/%s' % construct_ID('librarian_translated_final', collection), translated_list)
        if index >= 128:
            ## gotta stop sometime; this only checks up to the 128th email
            break


def write_attachment(content, collection_code):
    with open('./data/decisions/%s' % construct_ID('librarian_decisions', collection_code), 'wb') as f:
        f.write(content)


def compare_decisions(csv_list):
    ## appends a 'final_decision' column built from the model prediction and the librarian's answer
    pred_index, dec_index = get_decision_and_prediction_index(csv_list)
    csv_list[0].append('final_decision')
    for row in csv_list[1:]:
        ## boolean arithmetic: 1 when the prediction and the answer are both 1 or both 0, otherwise 0
        row.append(((not int(row[pred_index])) * (row[dec_index] == '0')) + (int(row[pred_index]) * (row[dec_index] == '1')))
    return csv_list
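
## Worked example of the final_decision arithmetic above. The reading of the values is an
## assumption (1 in 'Remove from collection' = model recommends withdrawal, '1' in
## 'Do you agree?' = the librarian agrees); the arithmetic itself follows from the code:
##   prediction 1, answer '1' -> 1  (confirmed removal)
##   prediction 0, answer '0' -> 1  (librarian overrides a keep)
##   prediction 1, answer '0' -> 0  (librarian overrides a removal)
##   prediction 0, answer '1' -> 0  (confirmed keep)

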
def get_decision_and_prediction_index(result_list):
    ## returns the column indices of the model prediction and the librarian decision
    pred = result_list[0].index('Remove from collection')
    dec = result_list[0].index('Do you agree?')
    return pred, dec


def translate_final(csv_list):
    ## translates the last column (final_decision) from 0/1 to the human-readable 'keep'/'remove';
    ## the header row is left unchanged
    wrapper_list = list(csv_list)
    column_index = len(wrapper_list[0]) - 1
    header = wrapper_list[0]
    translated = []
    for row in wrapper_list:
        if row[column_index] != header[column_index]:
            ## branchless translation: boolean multiplication picks the matching string
            row[column_index] = (row[column_index] == 0) * 'keep' + (row[column_index] == 1) * 'remove'
        translated.append(row)
    return translated


def integrate_into_training_set(csv_wrapper):
    ## copies each finalized decision into the 'Withdrawn' column, strips the prediction and
    ## decision columns, and appends the row to the training set
    ## training_set = read_csv_to_list_wrapper('./data/training_data/global_training_data.sv')
    ## temporary training set
    training_set = read_csv_to_list_wrapper('./data/training_data/training_ALL_api_out.csv')
    final_index = csv_wrapper[0].index('final_decision')
    withdrawn_index_dec = csv_wrapper[0].index('Withdrawn')
    prediction_index = csv_wrapper[0].index('Remove from collection')
    decision_index = csv_wrapper[0].index('Do you agree?')
    withdrawn_index_train = training_set[0].index('Withdrawn')
    for row in csv_wrapper[1:]:
        row[withdrawn_index_dec] = row[final_index]
        row.pop(final_index)
        row.pop(decision_index)
        row.pop(prediction_index)
        if withdrawn_index_train == withdrawn_index_dec:
            training_set.append(row)
        else:
            print('mismatched Withdrawn indices')
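

## Hypothetical usage sketch, not part of the original workflow: one way the helpers above might
## be chained for a single collection. The collection code 'AB_REFERENCE' and the email address
## are placeholders; construct_ID, acc, and the csv helpers come from the imports at the top, and
## the predictions file is assumed to already exist in the working directory.
if __name__ == '__main__':
    collection = 'AB_REFERENCE'  # placeholder collection code matching the pattern in read_file_name
    ## mail the predictions spreadsheet to the librarian for review
    sendmail(construct_ID('predictions', collection), 'librarian@example.org')
    ## later, pull replies back in and save the returned decision files for processing
    loop_inbox()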