Source code for pygenprop.assignment_file_parser
#!/usr/bin/env python
"""
Created by: Lee Bergstrand (2017)
Description: A parser for parsing genome properties longform files.
"""
import csv
from os.path import basename, splitext
from pygenprop.assign import AssignmentCache
[docs]def parse_genome_property_longform_file(longform_file):
"""
Parses longform genome properties assignment files.
:param longform_file: A longform genome properties assignment file handle object.
:return: An assignment cache object.
"""
property_id = ''
step_number = ''
assignment_cache = AssignmentCache(sample_name=splitext(basename(longform_file.name))[0])
for line in longform_file:
if 'PROPERTY:' in line:
property_id = line.split(':')[1].strip()
elif 'STEP NUMBER:' in line:
step_number = int(line.split(':')[1].strip())
elif 'RESULT:' in line:
assignment = line.split(':')[1].strip().upper()
if 'STEP' in line:
assignment_cache.cache_step_assignment(property_id, step_number, assignment)
else:
assignment_cache.cache_property_assignment(property_id, assignment)
else:
continue
return assignment_cache
[docs]def parse_interproscan_file(interproscan_file):
"""
Parses InterProScan TSV files into an assignment cache.
:param interproscan_file: A InterProScan file handle object.
:return: An assignment cache object.
"""
identifiers = []
tsv_reader = csv.reader(interproscan_file, delimiter='\t')
for row in tsv_reader:
matched_interpro_member_database_id = row[4]
identifiers.append(matched_interpro_member_database_id)
return AssignmentCache(interpro_member_database_identifiers=identifiers,
sample_name=splitext(basename(interproscan_file.name))[0])