Add command for loading data from csv

Add minor test data

Add command for loading data from csv
6ddd9580 · Odin Johan Vatne · dd27aaf6 · 6ddd9580 · 6ddd9580 · 6ddd9580
Commit 6ddd9580 authored 3 years ago by Odin Johan Vatne
--- a/pasapp/management/commands/_readcsv.py
+++ b/pasapp/management/commands/_readcsv.py
+from datetime import datetime, timedelta, timezone, tzinfo
+import logging
+from django.core.exceptions import *
+import csv
+import os
+from distutils.util import strtobool
+from contextlib import contextmanager
+from pasapp.models import ProjectTag, TagCategory, Tag, Project, Application
+from pasapp.forms import NewUserForm
+from django.contrib.auth.models import User
+
+def to_bool(value):
+    """Convert a csv cell such as 'yes', 'no', 'true' or 'false' to a bool.
+
+    The parsing is delegated to distutils' strtobool, which raises ValueError
+    for strings it does not recognise.
+    """
+    return bool(strtobool(value))
+
+# unit letters of the relative datetime shorthand -> timedelta keyword names
+short_map = {'w': 'weeks', 'd': 'days', 'h': 'hours', 'm': 'minutes'}
+# Central European Summer Time is two hours ahead of UTC
+cest_tz = timezone(timedelta(hours=2), 'CEST')
+
+def read_csv(input_folder):
+    """Import the csv files found in input_folder into the database.
+
+    The files are handled in the order tagcategories, tags, users, projects,
+    applications; a model type without a matching file is skipped with a warning.
+    The collected counters are printed at the end. Returns None.
+    """
+    # ref_id from the csv -> primary key of the imported object, used to resolve
+    # references between the files of one run
+    tag_category_map = {}
+    tag_map = {}
+    project_map = {}
+    application_map = {}
+
+    # counter title -> total, printed once all files have been read
+    output_stats = {}
+
+    def stats_add(key, value):
+        """Add value to the running total kept under key in output_stats."""
+        output_stats[key] = output_stats.get(key, 0) + value
+    
+    def get_or_throw(map, key):
+        """Return map[key]; raise ObjectDoesNotExist naming the key when it is absent."""
+        try:
+            return map[key]
+        except KeyError:
+            # the error logger prints the exception text, so say which refID was missing
+            raise ObjectDoesNotExist(f"No imported object with refID '{key}'.") from None
+    
+    def get_params(keys, row, skipped=()):
+        """Build model keyword arguments from one csv row.
+
+        keys yields (column name, column index) pairs from the header and row is the
+        list of cells. 'ref_id', names listed in skipped, foreign key columns
+        ('f_'/'ff_') and empty cells are left out. The prefixes 'b_', 'i_' and 'd_'
+        are stripped from the name and the cell is converted to bool, int or
+        datetime; any other cell is passed on unchanged as a string.
+        """
+        params = {}
+        for name, index in keys:
+            # ref ids and foreign keys get case-by-case handling in the callers
+            if name == 'ref_id' or name in skipped or name.startswith(('f_', 'ff_')):
+                continue
+            cell = row[index]
+            if cell == '':
+                # uses model default, use ' ' if a blank string is needed
+                continue
+            prefix, field = name[:2], name[2:]
+            if prefix == 'b_': # boolean fields
+                params[field] = to_bool(cell)
+            elif prefix == 'i_': # integer fields
+                params[field] = int(cell)
+            elif prefix == 'd_': # datetime fields
+                if cell.startswith('n-'):
+                    # shorthand for a moment in the past, relative to when the data is loaded
+                    # (n)ow minus: (w)eeks, (d)ays, (h)ours, (m)inutes: n-1w2h25m
+                    delta_args = {}
+                    digits = ''
+                    for char in cell[2:]:
+                        if char in '0123456789':
+                            digits += char
+                        elif char in short_map:
+                            delta_args[short_map[char]] = int(digits)
+                            digits = ''
+                    params[field] = datetime.now(tz=cest_tz) - timedelta(**delta_args)
+                else:
+                    # 1998.11.21 14:46:55 -> 1998, 11, 21, 14, 46, 55
+                    pieces = cell.replace('.', ':').replace(' ', ':').split(':')
+                    params[field] = datetime(*map(int, pieces), tzinfo=cest_tz)
+            else:
+                params[name] = cell
+        return params
+
+    def read_tag_categories(file):
+        """Import TagCategory rows from the tag categories csv file.
+
+        Columns: ref_id (required), category (the name) and optionally b_academic.
+        Each imported category is registered in tag_category_map under its ref_id
+        so that tags can refer to it. A row that raises is logged and counted by
+        the error logger instead of aborting the import.
+        """
+        created_cnt = 0
+        failed_cnt = 0
+        existing_cnt = 0
+        error_cnt = 0
+        def cnt_increment():
+            nonlocal error_cnt
+            error_cnt += 1
+        # the csv module asks for newline='' so that it handles line endings itself
+        with open(file, newline='') as f:
+            reader = csv.reader(f)
+            error_logger = ErrorLogger('tagcategories', cnt_increment)
+
+            # header row: column name -> column index
+            col_map = {val: i for i, val in enumerate(next(reader, []))}
+            if 'ref_id' not in col_map:
+                logging.error("'tagcategories.csv' is missing necessary column 'ref_id'")
+                return
+            for row in reader:
+                if not row:
+                    continue # blank line
+                row_num = reader.line_num # line in the file, the header being line 1
+                # the row is read inside the handler so a short or malformed row is logged, not fatal
+                with error_logger.handle(row_num):
+                    ref_id = row[col_map['ref_id']]
+                    params = get_params(col_map.items(), row)
+                    retrieved, created = TagCategory.objects.get_or_create(**params)
+                    if retrieved:
+                        tag_category_map[ref_id] = retrieved.pk
+                        if created:
+                            created_cnt += 1
+                        else:
+                            existing_cnt += 1
+                    else:
+                        logging.warning(f"Could not create TagCategory {params.get('category')}, ref_id {ref_id}. Found at line {row_num} of 'tagcategories.csv'.")
+                        failed_cnt += 1
+        if created_cnt > 0: stats_add('Tag Categories created', created_cnt)
+        if existing_cnt > 0: stats_add('Tag Categories already existing', existing_cnt)
+        # rows that raised count as failed too, so report them even when failed_cnt is 0
+        if failed_cnt + error_cnt > 0: stats_add('Tag Categories failed', failed_cnt+error_cnt)
+        if error_cnt > 0: stats_add('Errors occured while creating Tag Categories', error_cnt)
+
+    def read_tags(file):
+        """Import Tag rows from the tags csv file.
+
+        Columns: name and f_category (both required) and optionally ref_id.
+        f_category is a ref_id from the tag categories file, resolved through
+        tag_category_map. Tags that carry a ref_id are registered in tag_map so
+        that projects can refer to them. A row that raises is logged and counted
+        by the error logger instead of aborting the import.
+        """
+        created_cnt = 0
+        failed_cnt = 0
+        existing_cnt = 0
+        error_cnt = 0
+        def cnt_increment():
+            nonlocal error_cnt
+            error_cnt += 1
+        # the csv module asks for newline='' so that it handles line endings itself
+        with open(file, newline='') as f:
+            reader = csv.reader(f)
+            error_logger = ErrorLogger('tags', cnt_increment)
+
+            # header row: column name -> column index
+            col_map = {val: i for i, val in enumerate(next(reader, []))}
+            for key in ('name', 'f_category'):
+                if key not in col_map:
+                    logging.error(f"'tags.csv' is missing necessary column '{key}'")
+                    return
+            for row in reader:
+                if not row:
+                    continue # blank line
+                row_num = reader.line_num # line in the file, the header being line 1
+                # the row is read inside the handler so a short row is logged, not fatal
+                with error_logger.handle(row_num):
+                    ref_id = row[col_map['ref_id']] if 'ref_id' in col_map else False
+                    name = row[col_map['name']]
+                    category_id = get_or_throw(tag_category_map, row[col_map['f_category']])
+                    retrieved, created = Tag.objects.get_or_create(name=name, category_id=category_id)
+                    if retrieved:
+                        if ref_id:
+                            tag_map[ref_id] = retrieved.pk
+                        if created:
+                            created_cnt += 1
+                        else:
+                            existing_cnt += 1
+                    else:
+                        logging.warning(f"Could not create Tag {name}, ref_id {ref_id}. Found at line {row_num} of 'tags.csv'.")
+                        failed_cnt += 1
+        if created_cnt > 0: stats_add('Tags created', created_cnt)
+        if existing_cnt > 0: stats_add('Tags already existing', existing_cnt)
+        # rows that raised count as failed too, so report them even when failed_cnt is 0
+        if failed_cnt + error_cnt > 0: stats_add('Tags failed', failed_cnt+error_cnt)
+        if error_cnt > 0: stats_add('Errors occured while creating tags', error_cnt)
+    
+    def read_users(file):
+        """Import users from the users csv file through NewUserForm.
+
+        Columns: username and email (both required); first_name, last_name and
+        b_isProfessor are optional. A 'password' column is ignored: every user
+        is created with the fixed password 'password'. A username that already
+        exists is counted as existing and left untouched.
+        """
+        created_cnt = 0
+        failed_cnt = 0
+        existing_cnt = 0
+        # the csv module asks for newline='' so that it handles line endings itself
+        with open(file, newline='') as f:
+            reader = csv.reader(f)
+            # header row: column name -> column index
+            col_map = {val: i for i, val in enumerate(next(reader, []))}
+            for key in ('username', 'email'):
+                if key not in col_map:
+                    logging.error(f"'users.csv' is missing necessary column '{key}'")
+                    return
+            for row in reader:
+                if not row:
+                    continue # blank line
+                row_num = reader.line_num # line in the file, the header being line 1
+                try:
+                    form_args = get_params(col_map.items(), row, ('password',))
+                except (IndexError, TypeError, ValueError) as e:
+                    # short row or a cell that does not convert: skip this user, keep importing
+                    logging.warning(f"Could not read User at line {row_num} of 'users.csv': {e}")
+                    failed_cnt += 1
+                    continue
+                # NOTE: fixed, well-known password - only acceptable for test datasets
+                form_args['password1'] = form_args['password2'] = 'password'
+
+                # explicit existence check; database errors are no longer swallowed by a bare except
+                username = form_args.get('username')
+                if username is not None and User.objects.filter(username=username).exists():
+                    existing_cnt += 1
+                    continue
+                form = NewUserForm(form_args)
+                if form.is_valid():
+                    form.save()
+                    created_cnt += 1
+                else:
+                    logging.warning(f"Could not create User {row[col_map['username']]}, found at line {row_num} of 'users.csv':")
+                    logging.warning(form.errors)
+                    failed_cnt += 1
+        if created_cnt > 0: stats_add('Users created', created_cnt)
+        if existing_cnt > 0: stats_add('Users already existing', existing_cnt)
+        if failed_cnt > 0: stats_add('Users failed', failed_cnt)
+
+    def read_projects(file):
+        """Import Project rows and their ProjectTag links from the projects csv file.
+
+        Columns: title, f_professor (a username) and status are required; ref_id,
+        ff_tags (tag ref_ids separated by ';'), description, b_hidden,
+        d_date_created and d_last_updated are optional. A project with the same
+        title, professor and status as the row counts as already existing.
+        Projects that carry a ref_id are registered in project_map so that
+        applications can refer to them. A row or tag that raises is logged and
+        counted by its error logger instead of aborting the import.
+        """
+        created_cnt = 0
+        failed_cnt = 0
+        existing_cnt = 0
+        error_cnt = 0
+        created_tag_cnt = 0
+        existing_tag_cnt = 0
+        failed_tag_cnt = 0
+        error_tag_cnt = 0
+        def cnt_increment():
+            nonlocal error_cnt
+            error_cnt += 1
+        def tag_cnt_increment():
+            nonlocal error_tag_cnt
+            error_tag_cnt += 1
+        # the csv module asks for newline='' so that it handles line endings itself
+        with open(file, newline='') as f:
+            reader = csv.reader(f)
+            error_logger = ErrorLogger('projects', cnt_increment)
+            # tag refs live in the ff_tags column of projects.csv, so that is the file to report
+            tag_error_logger = ErrorLogger('projects', tag_cnt_increment)
+
+            # header row: column name -> column index
+            col_map = {val: i for i, val in enumerate(next(reader, []))}
+            for key in ('f_professor', 'title', 'status'):
+                if key not in col_map:
+                    logging.error(f"'projects.csv' is missing necessary column '{key}'")
+                    return
+            for row in reader:
+                if not row:
+                    continue # blank line
+                row_num = reader.line_num # line in the file, the header being line 1
+                with error_logger.handle(row_num):
+                    ref_id = row[col_map['ref_id']] if 'ref_id' in col_map else None
+                    professor = User.objects.get(username=row[col_map['f_professor']])
+                    params = get_params(col_map.items(), row)
+                    # the model has no unique fields, so duplicates are detected by hand
+                    retrieved = Project.objects.filter(title=row[col_map['title']], professor=professor, status=row[col_map['status']]).first()
+                    created = False
+                    if retrieved is None:
+                        retrieved, created = Project.objects.get_or_create(professor=professor, **params)
+                    if retrieved:
+                        if ref_id:
+                            project_map[ref_id] = retrieved.pk
+                        if created:
+                            created_cnt += 1
+                        else:
+                            existing_cnt += 1
+                        if 'ff_tags' in col_map:
+                            # an empty cell means "no tags": ''.split(';') would yield one empty ref
+                            tag_refs = [ref.strip() for ref in row[col_map['ff_tags']].split(';') if ref.strip()]
+                            for tag_ref in tag_refs:
+                                # pass the ref along so the log names the tag that failed
+                                with tag_error_logger.handle(row_num, tag_ref):
+                                    tag_id = get_or_throw(tag_map, tag_ref)
+                                    project_tag, created_tag = ProjectTag.objects.get_or_create(project=retrieved, tag_id=tag_id)
+                                    if not project_tag:
+                                        failed_tag_cnt += 1
+                                    elif created_tag:
+                                        created_tag_cnt += 1
+                                    else:
+                                        existing_tag_cnt += 1
+                    else:
+                        logging.warning(f"Could not create Project with ref_id {ref_id}. Found at line {row_num} of projects.csv.")
+                        failed_cnt += 1
+        if created_cnt > 0: stats_add('Projects created', created_cnt)
+        if existing_cnt > 0: stats_add('Projects already existing', existing_cnt)
+        # rows that raised count as failed too, so report them even when failed_cnt is 0
+        if failed_cnt + error_cnt > 0: stats_add('Projects failed', failed_cnt+error_cnt)
+        if error_cnt > 0: stats_add('Errors occured while creating Projects', error_cnt)
+        if created_tag_cnt > 0: stats_add('ProjectTags created', created_tag_cnt)
+        if existing_tag_cnt > 0: stats_add('ProjectTags already existing', existing_tag_cnt)
+        if failed_tag_cnt > 0: stats_add('ProjectTags failed', failed_tag_cnt)
+        if error_tag_cnt > 0: stats_add('Errors occured while creating ProjectTags', error_tag_cnt)
+
+    def read_applications(file):
+        """Import Application rows from the applications csv file.
+
+        Columns: f_project (a project ref_id from this run) and f_student (a
+        username) are required; ref_id, i_priority, message, professor_status,
+        student_status, d_date_created and d_last_updated are optional.
+        Applications that carry a ref_id are registered in application_map.
+        A row that raises is logged and counted by the error logger instead of
+        aborting the import.
+        """
+        created_cnt = 0
+        failed_cnt = 0
+        existing_cnt = 0
+        error_cnt = 0
+        def cnt_increment():
+            nonlocal error_cnt
+            error_cnt += 1
+        # the csv module asks for newline='' so that it handles line endings itself
+        with open(file, newline='') as f:
+            reader = csv.reader(f)
+            error_logger = ErrorLogger('applications', cnt_increment)
+
+            # header row: column name -> column index
+            col_map = {val: i for i, val in enumerate(next(reader, []))}
+            for key in ('f_project', 'f_student'):
+                if key not in col_map:
+                    logging.error(f"'applications.csv' is missing necessary column '{key}'")
+                    return
+            for row in reader:
+                if not row:
+                    continue # blank line
+                row_num = reader.line_num # line in the file, the header being line 1
+                with error_logger.handle(row_num):
+                    ref_id = row[col_map['ref_id']] if 'ref_id' in col_map else None
+                    project_id = get_or_throw(project_map, row[col_map['f_project']])
+                    student = User.objects.get(username=row[col_map['f_student']])
+                    params = get_params(col_map.items(), row)
+                    retrieved, created = Application.objects.get_or_create(project_id=project_id, student=student, **params)
+                    if retrieved:
+                        if ref_id:
+                            application_map[ref_id] = retrieved.pk
+                        if created:
+                            created_cnt += 1
+                        else:
+                            existing_cnt += 1
+                    else:
+                        logging.warning(f"Could not create Application by {row[col_map['f_student']]} on project with refID '{row[col_map['f_project']]}'. Found at line {row_num} of applications.csv.")
+                        failed_cnt += 1
+        if created_cnt > 0: stats_add('Applications created', created_cnt)
+        if existing_cnt > 0: stats_add('Applications already existing', existing_cnt)
+        # rows that raised count as failed too, so report them even when failed_cnt is 0
+        if failed_cnt + error_cnt > 0: stats_add('Applications failed', failed_cnt+error_cnt)
+        if error_cnt > 0: stats_add('Errors occured while creating Applications', error_cnt)
+
+    # model type -> the reader that imports its csv file
+    deferred_functions = {
+        'tagcategories': read_tag_categories,
+        'tags': read_tags,
+        'users': read_users,
+        'projects': read_projects,
+        'applications': read_applications,
+    }
+
+    # every regular *.csv file directly inside the input folder
+    validfiles = []
+    for entry in os.listdir(input_folder):
+        path = os.path.join(input_folder, entry)
+        if os.path.isfile(path) and path.endswith(".csv"):
+            validfiles.append(path)
+
+    # fixed order: later model types refer to objects imported by the earlier ones
+    for modeltype in ('tagcategories', 'tags', 'users', 'projects', 'applications'):
+        # the first file whose name ends in '<modeltype>.csv' is used
+        modelfile = next((path for path in validfiles if path.endswith(modeltype + ".csv")), None)
+        if modelfile is None:
+            logging.warning(f'No file found for {modeltype}.')
+            continue
+
+        deferred_functions[modeltype](modelfile)
+
+    for title, value in output_stats.items():
+        print(f'{title}: {value}.')
+
+    return
+
+class ErrorLogger:
+    """Logs the errors raised while importing one csv file and reports each to counter_func."""
+
+    def __init__(self, filename, counter_func):
+        # quoted file name as it appears in the log messages
+        self.filename = f"'{filename}.csv'"
+        # called once for every with-body that did not finish cleanly
+        self.counter_func = counter_func
+
+    @contextmanager
+    def handle(self, row_num, ref_id=None):
+        """Context manager that logs any Exception raised in its body instead of propagating it.
+
+        row_num and ref_id are only used to describe the position in the log messages.
+        """
+        if ref_id is None:
+            position = f"model at line {row_num} in {self.filename}"
+        else:
+            position = f"refID '{ref_id}' in a field of the model at line {row_num} in {self.filename}"
+        failed = True
+        try:
+            yield
+            failed = False
+        except (FieldDoesNotExist, FieldError):
+            logging.error(f"One or more of the fields specified in the header of {self.filename} were not defined in the model.")
+        except ValidationError:
+            logging.error(f"The {position} failed to validate. This is likely a missing foreign key issue.")
+        except ObjectDoesNotExist as e:
+            if ref_id is not None:
+                logging.error(f"The refID '{ref_id}' at line {row_num} in {self.filename} was not found.")
+            else:
+                logging.error(f"A foreign key or refID at line {row_num} in {self.filename} was not found.")
+                logging.error(e)
+        except Exception as e:
+            logging.error(f"The {position} could not be imported.")
+            logging.error(e)
+        finally:
+            # also reached when something other than an Exception passes through
+            if failed:
+                self.counter_func()
+
+
+
+# with open(modelfile) as f:
+#     reader = csv.reader(f)
+#     col_map = {}
+#     for row_num, row in enumerate(reader):
+#         if row_num == 0:
+#             for i, val in enumerate(row):
+#                 col_map[i] = val
+#         else:
+#             model_args = {col_map[i]: value for i, value in enumerate(row)}
+
+# model_args = {col_map[i]: value for i, value in enumerate(row)}
+# try:
+#     gotten, created = model.get_or_create(**model_args)
+#     if gotten:
+#         Logger.warn(f"Model at line {str(row_num)} already exists.")
+#     elif not created:
+#         Logger.warn(f"Could not create the model specified at line {str(row_num)}.")
+# except FieldDoesNotExist:
+#     Logger.error("One or more of the fields specified in the csv file's header were not defined in the model.")
+# except FieldError:
+#     Logger.error("One or more of the fields specified in the csv file's header were not defined in the model.")
+# except ValidationError:
+#     Logger.error(f"The model at line {str(row_num)} failed to validate.")
+# except:
+#     Logger.error(f"The model at line {str(row_num)} could not be imported.")
\ No newline at end of file
--- a/pasapp/management/commands/loadcsvdata.py
+++ b/pasapp/management/commands/loadcsvdata.py
+from django.core.management.base import BaseCommand
+from django.apps import apps
+from ._readcsv import read_csv
+import os
+
+def dir_path(string):
+    """Return string unchanged when it names an existing directory, else raise NotADirectoryError."""
+    if not os.path.isdir(string):
+        raise NotADirectoryError(string)
+    return string
+
+class Command(BaseCommand):
+    """Management command that hands a folder of csv files to read_csv."""
+    help = 'Loads a test dataset into the database from a folder.'
+
+    def add_arguments(self, parser):
+        # nargs=1 makes the parsed value a one-element list; dir_path rejects non-directories
+        parser.add_argument('folder_path', nargs=1, type=dir_path)
+
+    def handle(self, *args, **options):
+        folder_paths = options['folder_path']
+        directory = folder_paths[0]
+        if not directory:
+            return
+        read_csv(os.path.normpath(directory))
--- a/test_data/base_dataset/applications.csv
+++ b/test_data/base_dataset/applications.csv
+ref_id,f_project,f_student,i_priority,professor_status,student_status,d_date_created,d_last_updated,message
+,tree_surv,emile,,,,,,"I think trees are really neat"
\ No newline at end of file
--- a/test_data/base_dataset/projects.csv
+++ b/test_data/base_dataset/projects.csv
+ref_id,title,f_professor,status,b_hidden,d_date_created,d_last_updated,ff_tags,description
+tree_surv,Noninvasive devices for long-term pine tree surveys,frankf,Open,no,n-1d,,size_1;size_g;comm_g,"The project will involve creating a prototype sensor device for long term survey of Norwegian pine trees, in collaboration with the Norwegian Forestry and Wildlife Department"
\ No newline at end of file
--- a/test_data/base_dataset/tagcategories.csv
+++ b/test_data/base_dataset/tagcategories.csv
+ref_id,category,b_academic
+priv,Privacy Level,true
+size,Group Size,false
+comm,Customer Type,false
\ No newline at end of file
--- a/test_data/base_dataset/tags.csv
+++ b/test_data/base_dataset/tags.csv
+ref_id,name,f_category
+priv_sens,Sensitive,priv
+priv_conf,Confidential,priv
+size_1,Solo,size
+size_g,Group,size
+size_d,Duo,size
+comm_u,University Project,comm
+comm_g,Government Project,comm
+comm_p,Third Party Project,comm
\ No newline at end of file
--- a/test_data/base_dataset/users.csv
+++ b/test_data/base_dataset/users.csv
+username,email,first_name,last_name,b_isProfessor
+emile,emile@stud.ntnu.no,Emil,Ebert,no
+frankf,frankf@ntnu.no,Frank,Farenheit,yes
\ No newline at end of file