import os
import h5py
import numpy
import pandas
import json
import gzip
import shlex
import subprocess
import fasteners
import warnings
import shutil
import random
import string
from os.path import join,split,exists
from werkzeug.utils import secure_filename
from shutil import copytree,ignore_patterns,copyfile
[docs]
datatype_mappings={
"int64":"integer",
"float64":"double",
"float32":"double",
"object":"text",
"category":"text",
"bool":"text",
"int32":"double"
}
[docs]
numpy_dtypes={
"text":numpy.ubyte,
"multitext":numpy.uint16,
"double":numpy.float32,
"integer":numpy.float32,
"int32":numpy.int32
#unique created in fly (depends on string length)
}
[docs]
class MDVProject:
def __init__(self,dir,delete_existing=False):
self.dir=dir
if delete_existing and exists(dir):
shutil.rmtree(dir)
self.h5file = join(dir,"datafile.h5")
self.datasourcesfile= join(dir,"datasources.json")
self.statefile= join(dir,"state.json")
self.viewsfile= join(dir,"views.json")
self.imagefolder = join(dir,"images")
self.trackfolder = join(dir,"tracks")
if not exists(dir):
os.mkdir(dir)
if not exists(self.trackfolder):
os.mkdir(self.trackfolder)
if not exists(self.datasourcesfile):
with open(self.datasourcesfile,"w") as o:
o.write(json.dumps([]))
if not exists(self.viewsfile):
with open(self.viewsfile,"w") as o:
o.write(json.dumps({}))
o.close()
if not exists(self.statefile):
with open(self.statefile,"w") as o:
o.write(json.dumps({
"all_views":[],
"popouturl":"popout.html"
}))
self._lock = fasteners.InterProcessReaderWriterLock(join(dir,"lock"))
@property
[docs]
def datasources(self):
return get_json(self.datasourcesfile)
@datasources.setter
def datasources(self,value):
save_json(self.datasourcesfile,value)
@property
[docs]
def views(self):
return get_json(self.viewsfile)
@views.setter
def views(self,value):
save_json(self.viewsfile,value)
@property
[docs]
def state(self):
return get_json(self.statefile)
@state.setter
def state(self,value):
save_json(self.statefile,value)
[docs]
def set_editable(self,edit):
c= self.state
c["permission"] = "edit" if edit else "view"
self.state=c
[docs]
def lock(self,type="read"):
return self._lock.read_lock() if type=="read" else self._lock.write_lock()
[docs]
def get_datasource_as_dataframe(self,datasource):
ds= self.get_datasource_metadata(datasource)
df = pandas.DataFrame()
for c in ds["columns"]:
data = self.get_column(datasource,c["field"])
df[c["name"]]=data
return df
[docs]
def check_columns_exist(self,datasource,columns):
md = self.get_datasource_metadata(datasource)
all_cols = set([x["field"] for x in md["columns"]])
return [x for x in columns if not x in all_cols]
[docs]
def set_interactions(self,interaction_ds,parent_ds,
pivot_column="sample_id",
parent_column="annotation",
is_single_region=True,
interaction_columns=["Cell Type 1","Cell Type 2"],
default_parameter="Cross PCF gr20",
node_size="cell 1 number",
add_view=True):
#check columns exist in the appropriate data sets
missing_cols= self.check_columns_exist(interaction_ds,[pivot_column,default_parameter,node_size]+interaction_columns )
if len(missing_cols)>0:
raise AttributeError(f'columns {",".join(missing_cols)} not found in {interaction_ds} datasource')
missing_cols= self.check_columns_exist(parent_ds,[pivot_column,parent_column])
if len(missing_cols)>0:
raise AttributeError(f'columns {",".join(missing_cols)} not found in {parent_ds} datasource')
#update the config
md = self.get_datasource_metadata(interaction_ds)
md["interactions"]={
"pivot_column":pivot_column,
"is_single_region":is_single_region,
"interaction_columns":interaction_columns,
"spatial_connectivity_map":{
"link_length": default_parameter,
"link_thickness":default_parameter,
"link_color": default_parameter,
"node_size": node_size
},
"cell_radial_chart":{"link_thickness":default_parameter}
}
self.set_datasource_metadata(md)
#update the links between datasources
self.insert_link(interaction_ds,parent_ds,"interactions",
{
"interaction_columns":interaction_columns+[parent_column],
"pivot_column":pivot_column,
"is_single_region":is_single_region
})
if add_view:
#todo add stuff to the view
self.set_view(interaction_ds,{
"initialCharts":{
parent_ds:[],
interaction_ds:[]
}
})
[docs]
def _get_h5_handle(self,read_only=False):
mode = "r"
if not exists(self.h5file):
mode="w"
elif not read_only:
mode="a"
return h5py.File(self.h5file,mode)
[docs]
def get_column(self,datasource,column,raw=False):
cm = self.get_column_metadata(datasource,column)
h5 = self._get_h5_handle()
raw_data = numpy.array(h5[datasource][column])
if raw:
return raw_data
dt = cm["datatype"]
if dt == "text":
data= [cm["values"][x] for x in raw_data]
elif dt == "multitext":
chunksize = raw_data.shape[0]/cm["stringLength"]
arr = numpy.split(raw_data,chunksize)
data = [",".join([cm["values"][x]for x in y if x != 65535]) for y in arr]
elif dt == "unique":
data = [x.decode() for x in raw_data]
else:
data = list(raw_data)
h5.close()
return data
[docs]
def set_column_with_raw_data(self,datasource,column,raw_data):
'''Adds or updates a column with raw data
Args:
datasource (str): The name of the datasource.
column (dict): The complete metadata for the column
raw_data (list|array): The raw binary data for the column
'''
h5= self._get_h5_handle()
cid= column["field"]
if h5[datasource].get(cid):
del h5[datasource][cid]
dt = numpy_dtypes.get(column["datatype"])
if not dt:
dt =h5py.string_dtype('utf-8',column["stringLength"])
h5[datasource].create_dataset(cid,len(raw_data),data = raw_data,dtype=dt)
ds =self.get_datasource_metadata(datasource)
cols= ds["columns"]
ind = [c for c,x in enumerate(cols) if x["field"]==cid]
if len(ind)==0:
cols.append(column)
else:
cols[ind[0]]=column
self.set_datasource_metadata(ds)
[docs]
def set_column(self,datasource,column,data):
'''Adds (or replaces an existing column) with the data supplied
Args:
datasource (str): The name of the datasource.
column (str|dict): metadata for the column. Can be a string with the column's name,
although datatype should also be included as the inferred datatype
is not always correct
raw_data (list|array): Anything that can be converted into a pandas Series
The data should be in the correct order
'''
if type(column) == str:
column={"name":column}
if not column.get("field"):
column["field"]=column["name"]
ds= self.get_datasource_metadata(datasource)
ind = [c for c,x in enumerate(ds["columns"]) if x["field"]==column["field"]]
col_exists= len(ind)>0
li = pandas.Series(data)
if not column.get("datatype"):
column["datatype"]= datatype_mappings.get(str(li.dtype),"text")
h5 = self._get_h5_handle()
gr = h5[datasource]
if h5[datasource].get(column["field"]):
del h5[datasource][column["field"]]
add_column_to_group(column,li,gr,len(li))
h5.close()
if col_exists:
ds["columns"][ind[0]]=column
else:
ds["columns"].append(column)
self.set_datasource_metadata(ds)
[docs]
def remove_column(self,datasource,column):
'''Removes the specified column
Args:
datasource (str): The name of the data source.
column (str): The id (field) of the column.
'''
ds = self.get_datasource_metadata(datasource)
cols = [x for x in ds["columns"] if x["field"] != column]
if len(cols)==len(ds["columns"]):
warnings.warn(f"deleting non existing column: {column} from {datasource}")
return
ds["columns"]=cols
h5= self._get_h5_handle()
del h5[datasource][column]
self.set_datasource_metadata(ds)
[docs]
def add_annotations(self,datasource,data,separator="\t",missing_value="ND",columns=None,
supplied_columns_only=False):
'''Adds annotations based on an existing column
Args:
datasource (str): The name of the data source.
data (dataframe|str): Either a pandas dataframe or a text file. The first column
should be the 'index' column and match a column in the datasource. The other columns should
contain the annotations to add.
separator (str,optional): The delimiter if a text file is supplied (tab by default)
missing_value(str,optional): The value to put if the index value is missing in the input data.
Default is 'ND'
'''
if type(data) == str:
data= pandas.read_csv(data,sep=separator)
ds = self.get_datasource_metadata(datasource)
index_col = data.columns[0]
data=data.set_index(index_col)
columns= get_column_info(columns,data,supplied_columns_only)
col = [x for x in ds["columns"] if x["field"]==index_col]
if len(col)==0:
raise AttributeError(f'index column {index_col} not found in {datasource} datasource')
newdf= pandas.DataFrame({index_col:self.get_column(datasource,index_col)})
h5 = self._get_h5_handle()
gr = h5[datasource]
for c in columns:
d= {k:v for k,v in zip(data.index,data[c["field"]])}
#v slow - needs improving
ncol = newdf.apply(lambda row:d.get(row[0],missing_value),axis=1)
add_column_to_group(c,ncol,gr,len(ncol))
ds["columns"].append(c)
self.set_datasource_metadata(ds)
h5.close()
[docs]
def set_column_group(self,datasource,groupname,columns):
'''Adds (or changes) a column group
Args:
datasource(string): The name of the datasource
groupname(string): The name of the column group
columns(list): The field names of columns in the group. If None, then the column
group will be removed
'''
ds= self.get_datasource_metadata(datasource)
#check if columns exists
if columns:
colfields= set([x["field"] for x in ds["columns"]])
missingcols= [x for x in columns if x not in colfields]
if len(missingcols)>0:
raise AttributeError(f"adding non existent columns ({','.join(missingcols)}) to column group {groupname}\
in datasource {datasource}")
cg = ds.get("columnGroups")
#create entry if absent
if not cg:
cg=[]
ds["columnGroups"]=cg
#does group exist
ind = [c for c,x in enumerate(cg) if x["name"]==groupname]
#change (or delete) existing group
if len(ind)==1:
if columns:
cg[ind[0]]["columns"]=columns
else:
del cg[ind[0]]
#add new group
else:
#no group to delete
if not columns:
raise AttributeError(f"removing non existent column group {groupname}\
from datasource {datasource}")
#add new group
cg.append({
"name":groupname,
"columns":columns
})
self.set_datasource_metadata(ds)
[docs]
def delete_datasource(self,name,delete_views=True):
h5 = self._get_h5_handle()
del h5[name]
h5.close()
self.datasources = [x for x in self.datasources if x["name"] !=name]
#delete all views contining that datasource
if delete_views:
views = self.views
for view in views:
data= views[view]
if data["initialCharts"].get(name):
self.set_view(view,None)
[docs]
def add_genome_browser(self,datasource,parameters=["chr","start","end"],name=None):
# get all the genome locations
loc = [self.get_column(datasource,x) for x in parameters]
#write to a bed file
bed = join(self.trackfolder,"t.bed")
o=open(bed,"w")
for c,(chr,start,end) in enumerate(zip(loc[0],loc[1],loc[2])):
o.write(f"{chr}\t{start}\t{end}\t{c}\n")
o.close()
indexed_bed= join(self.trackfolder,"loc.bed")
create_bed_gz_file(bed,indexed_bed)
os.remove(bed)
if not name:
name = datasource
gb={
"location_fields":parameters,
"default_track":{
"url":"tracks/loc.bed.gz",
"label":name
}
}
ds= self.get_datasource_metadata(datasource)
ds["genome_browser"]=gb
self.set_datasource_metadata(ds)
[docs]
def add_datasource(self,name,dataframe,columns=None,supplied_columns_only=False,replace_data=False,
add_to_view="default",separator="\t"):
'''Adds a pandas dataframe to the project. Each column's datatype, will be deduced by the
data it contains, but this is not always accurate. Hence, you can supply a list of column
metadata, which will override the names/types deduced from the dataframe.
Args:
name (string): The name of datasource
dataframe (dataframe|str): Either a pandas dataframe or the path of a text file
columns (list, optional) : A list of objects containing the column name and datatype.
e.g. [{"name":"column_1","datatype":"double"},]. If you want the column to have a
different label, the object requires a field (the column name in the dataframe) and
a name (the label seen by the user) e.g. {"field":"column_1","datatype":"double","name":"My Column 1"}
supplied_columns_only(bool, optional): If True, only the the subset of columns in the columns argument
will be added to the datasource. Default is False
replace_data(bool, optional): If True, the existing datasource will be overwritten, Default is False,
in which case, trying to add a datasource which already exists, will throw an error.
add_to_view (string, optional): The datasource will be added to the specified view. The view will
be created if it does not exist. The default is 'default'. If None, then it will not be added to
a view.
separator (str, optional): If a path to text file is supplied, then this should be the file's delimiter.
Defaults to a tab.
'''
if type(dataframe)==str:
dataframe= pandas.read_csv(dataframe,sep=separator)
#get the columns to add
columns= get_column_info(columns,dataframe,supplied_columns_only)
#does the datasource exist
try:
ds = self.get_datasource_metadata(self,name)
except:
ds= None
if ds:
#delete the datasource
if replace_data:
self.delete_datasource(name)
else:
raise FileExistsError(f"Trying to create {name} datasource, which already exits")
#create the h5 group
h5 = self._get_h5_handle()
gr= h5.create_group(name)
size = len(dataframe)
dodgy_columns=[]
for col in columns:
try:
add_column_to_group(col,dataframe[col["field"]],gr,size)
except Exception as e:
dodgy_columns.append(col["field"])
warnings.warn(f"cannot add column {col['field']} to datasource {name}\n{e}")
h5.close()
columns = [x for x in columns if x["field"] not in dodgy_columns]
#add the metadata
ds = None
ds = {
"name":name,
"columns":columns,
"size":size
}
self.set_datasource_metadata(ds)
#add it to the view
if add_to_view:
v = self.get_view(add_to_view)
if not v:
v={"initialCharts":{}}
v["initialCharts"][name]=[]
self.set_view(add_to_view,v)
[docs]
def insert_link(self,datasource,linkto,linktype,data):
ds = self.get_datasource_metadata(datasource)
links = ds.get("links")
if not links:
links={}
ds["links"]=links
llink = links.get(linkto)
if not llink:
llink={}
links[linkto]=llink
llink[linktype]=data
self.set_datasource_metadata(ds)
[docs]
def add_rows_as_columns_link(self,ds_row,ds_col,column_name,name):
data ={
"name_column":column_name,
"name":name,
"subgroups":{}
}
self.insert_link(ds_row,ds_col,"rows_as_columns",data)
[docs]
def add_rows_as_columns_subgroup(self,row_ds,col_ds,stub,data,name=None,label=None,sparse=False):
name = name if name else stub
label = label if label else name
h5 = self._get_h5_handle()
gr = h5[row_ds].create_group(name)
if sparse:
gr.create_dataset("x",(len(data.data),),data=data.data,dtype=numpy.float32)
gr.create_dataset("i",(len(data.indices),),data=data.indices,dtype=numpy.uint32)
gr.create_dataset("p",(len(data.indptr),),data=data.indptr)
else:
l = data.shape[0]
total_len = data.shape[0] * data.shape[1]
gr.create_dataset("x",(total_len,),data=data.flatten("F"),dtype=numpy.float32)
gr["length"]=[l]
ds = self.get_datasource_metadata(row_ds)
ds["links"][col_ds]["rows_as_columns"]["subgroups"][stub]={
"name":name,
"label":label,
"type":"sparse" if sparse else "dense"
}
self.set_datasource_metadata(ds)
h5.close()
[docs]
def get_links(self,datasource,filter=None):
ds = self.get_datasource_metadata(datasource)
links=[]
lnks= ds.get("links")
if lnks:
for lnkto in lnks:
lnk = lnks[lnkto]
if (filter==None or lnk.get(filter)):
links.append({
"datasource":lnkto,
"link":lnk
})
return links
[docs]
def serve(self,**kwargs):
from .server import create_app
create_app(self,**kwargs)
[docs]
def get_configs(self):
config ={
"datasources":self.datasources,
"state":self.state,
}
#legacy
hyperion_conf= join(self.dir,"hyperion_config.json")
if os.path.exists(hyperion_conf):
config["hyperion_config"]= get_json(hyperion_conf)
#end
return config
[docs]
def convert_to_static_page(self,outdir,include_sab_headers=True):
fdir = split(os.path.abspath(__file__))[0]
tdir = join(fdir,"templates")
#copy everything except the data
copytree(self.dir,outdir,ignore=ignore_patterns("*.h5"))
#copy the js and images
copytree(join(fdir,"static"),join(outdir,"static"))
#create the static binary files
self.convert_data_to_binary(outdir)
#write out the index file
page = "page.html"
template = join(tdir,page)
page = open(template).read()
#dummy popout page
copyfile(join(tdir,"popout.html"),join(outdir,"popout.html"))
#call init with the static folder argument
page=page.replace("_mdvInit()","_mdvInit(true)")
#correct config
conf = self.state
#can't edit static page
conf["permission"]="view"
#throttle the dataloading so don't get network errors
conf["dataloading"]={
"split":5,
"threads":2
}
save_json(join(outdir,"state.json"),conf)
#add service worker for cross origin headers
if include_sab_headers:
page=page.replace("<!--sw-->",'<script src="serviceworker.js"></script>')
copyfile(join(tdir,"serviceworker.js"),join(outdir,"serviceworker.js"))
with open(join(outdir,"index.html"),"w") as o:
o.write(page)
[docs]
def save_state(self,state):
#update/add or view
#view will be deleted if view is null
if state.get("currentView"):
self.set_view(state["currentView"],state["view"])
ud= state.get("updatedColumns")
#update/add/delete any columns
if ud:
for ds in ud:
item= ud[ds]
for data in item["colors_changed"]:
self.set_column_metadata(ds,data["column"],"colors",data["colors"])
for data in item["columns"]:
self.set_column_with_raw_data(ds,data["metadata"],data["data"])
for col in item["removed"]:
self.remove_column(ds,col)
#update any datasource metadata
md = state.get("metadata")
if md:
for ds in md:
datasource= self.get_datasource_metadata(ds)
for param in md[ds]:
datasource[param]=md[ds][param]
self.set_datasource_metadata(datasource)
[docs]
def add_image_set(self,datasource,setname,column,folder,type="png"):
'''Adds a set of images to a datasource. The images should be in a folder, with the same name as the column
Args:
datasource (str): The name of the datasource.
column (str): The name of the column.
folder (str): The path to the folder containing the images.
'''
ds = self.get_datasource_metadata(datasource)
col =self.get_column_metadata(datasource,column)
images = [x for x in os.listdir(folder) if x.endswith(type)]
#create the image folder
fname= secure_filename(setname)
imdir = join(self.imagefolder,fname)
if not exists(imdir):
os.makedirs(imdir)
#copy the images
for im in images:
copyfile(join(folder,im),join(imdir,im))
#update the metadata
if not ds.get("images"):
ds["images"]={}
ds["images"][setname]={
"key_column":column,
"type":type,
"base_url":f"./images/{fname}/"
}
self.set_datasource_metadata(ds)
[docs]
def get_view(self,view):
views = self.views
return views.get(view)
[docs]
def set_view(self,name,view,make_default=False):
views = self.views
#update or add the view
if view:
views[name]=view
#remove the view
else:
if views.get(name):
del views[name]
self.views=views
state =self.state
#add to list and make default
if view:
if not name in state["all_views"]:
state["all_views"].append(name)
if make_default:
state["initial_view"]=name
#delete from list
else:
state["all_views"].remove(name)
iv = state.get("initial_view")
#if the deleted view is the default view then
#change the default view to the first view in the list
if iv:
state["initial_view"]=state["all_views"][0]
self.state=state
[docs]
def convert_data_to_binary(self,outdir=None):
if not outdir:
outdir=self.dir
h5 = h5py.File(self.h5file)
dss = self.datasources
for ds in dss:
n = ds["name"]
gr = h5[n]
dfile = join(outdir,"{}.gz".format(n))
o = open(dfile,"wb")
index={}
current_pos=0
for c in ds["columns"]:
dt = gr.get(c["field"])
if not dt:
continue
arr = numpy.array(dt)
comp = gzip.compress(arr.tobytes())
o.write(comp)
new_pos = current_pos +len(comp)
index[c["field"]]=[current_pos,new_pos-1]
current_pos = new_pos
#add rows to columns gene score / tensors etc
lnks = self.get_links(n,"rows_as_columns")
for ln in lnks:
rc= ln["link"]["rows_as_columns"]
for sg in rc["subgroups"]:
info = rc["subgroups"][sg]
sgrp = gr[info["name"]]
sparse = info.get("type")=="sparse"
#get number of rows in linked datasource
plen = [x["size"] for x in dss if x["name"]==ln["datasource"]][0]
for i in range (0,plen):
comp= gzip.compress(get_subgroup_bytes(sgrp,i,sparse))
o.write(comp)
new_pos = current_pos +len(comp)
index[f'{sg}{i}']=[current_pos,new_pos-1]
current_pos = new_pos
o.close()
ifile = dfile[:dfile.rindex(".")]+".json"
i = open (ifile,"w")
i.write(json.dumps(index))
i.close()
[docs]
def get_byte_data(self,columns,group):
h5 = h5py.File(self.h5file,"r")
byte_list=[]
for column in columns:
sg = column.get("subgroup")
if sg:
sgindex= int(column["sgindex"])
byte_list.append(get_subgroup_bytes(h5[group][sg],sgindex,column.get("sgtype")=="sparse"))
else:
data = h5[group][column["field"]]
byte_list.append(numpy.array(data).tobytes())
h5.close()
return b''.join(byte_list)
[docs]
def set_region_data(self,datasource,data,region_field="sample_id",
default_color="annotations",
position_fields=["x","y"],scale_unit="um",scale=1,):
md = self.get_datasource_metadata(datasource)
cols = set([x["field"] for x in md["columns"]])
missing = [x for x in [region_field]+[default_color]+position_fields if not x in cols]
if len(missing) >0:
raise AttributeError(f"setting region data on {datasource} but the specified columns({','.join(missing)}) are missing")
md["regions"]={
"position_fields":position_fields,
"region_field":region_field,
"default_color":default_color,
"scale_unit":scale_unit,
"scale":scale
}
#convert to dict
if not isinstance(data,dict):
df = pandas.read_csv(data,sep="\t")
df.set_index(df.columns[0],inplace=True)
data = df.to_dict("index")
all_regions={}
for k,v in data.items():
x = v.get("x_offset",0)
y = v.get("y_offset",0)
all_regions[k]={
"roi":{
"min_x":x,
"min_y":y,
"max_y":v["height"]+y,
"max_x":v["width"]+x
},
"images":{}
}
md["regions"]["all_regions"]=all_regions
self.set_datasource_metadata(md)
[docs]
def add_region_images(self,datasource,data):
imdir = join(self.dir,"images","regions")
if not exists(imdir):
os.makedirs(imdir)
md = self.get_datasource_metadata(datasource)
md["regions"]["base_url"]="images/regions/"
#convert flat file to dict
if not isinstance(data,dict):
df = pandas.read_csv(data,sep="\t")
df.set_index(df.columns[0],inplace=True)
data = df.to_dict("index")
all_regions = md["regions"]["all_regions"]
for k,v in data.items():
region = all_regions.get(k)
if not region:
raise AttributeError(f"adding image to non existant region ({k}) in {datasource}")
roi= region["roi"]
name = v.get("name")
x = v.get("offset_x",roi["min_x"])
y = v.get("offset_y",roi["min_y"])
region["default_image"]=name
reg={
"position":[x,y],
"height":v.get("height",roi["max_y"]-roi["min_y"]),
"width":v.get("width",roi["max_x"]-roi["min_x"]),
"name":name
}
#simple url
if v["path"].startswith("http"):
reg["url"]=v["path"]
#local file - need to copy to images directory
else:
im = split(v["path"])[1]
im_details= im.split(".")
newname= get_random_string()+"."+im_details[1]
shutil.copyfile(v["path"],join(imdir,newname))
reg["file"]=newname
all_regions[k]["images"][name]=reg
self.set_datasource_metadata(md)
[docs]
def add_viv_viewer(self, datasource,default_channels):
md = self.get_datasource_metadata(datasource)
reg = md.get("regions")
if not reg:
raise AttributeError(f"Adding viv viewer to {datasource}, which does not contain regions")
imdir = join(self.dir,"images","avivator")
if not exists(imdir):
os.makedirs(imdir)
reg["avivator"]={
"default_channels":default_channels,
"base_url":"images/avivator/"
}
self.set_datasource_metadata(md)
[docs]
def add_viv_images(self,datasource,data):
md = self.get_datasource_metadata(datasource)
try:
a=md["regions"]["avivator"]
except:
raise AttributeError(f"Adding viv images when viv viewer has not been specified")
all_regions = md["regions"]["all_regions"]
imdir = join(self.dir,"images","avivator")
if not isinstance(data,dict):
df = pandas.read_csv(data,sep="\t")
df.set_index(df.columns[0],inplace=True)
data = df.to_dict("index")
for k,v in data.items():
region = all_regions.get(k)
if not region:
raise AttributeError(f"adding image to non existant region ({k}) in {datasource}")
if v["path"].startswith("http"):
region["viv_image"]={
"url":v["path"]
}
#local file - need to copy to images directory
else:
newname= get_random_string()+".ome.tiff"
shutil.copyfile(v["path"],join(imdir,newname))
region["viv_image"]={
"file":newname
}
self.set_datasource_metadata(md)
[docs]
def get_json(file):
return json.loads(open(file).read())
[docs]
def save_json(file,data):
o = open(file,"w")
o.write(json.dumps(data,indent=2))
o.close()
[docs]
def get_subgroup_bytes(grp,index,sparse=False):
if sparse:
offset = grp["p"][index:index+2]
_len = offset[1]-offset[0]
_indexes = numpy.array(grp["i"][offset[0]:offset[1]])
_values= numpy.array(grp["x"][offset[0]:offset[1]],numpy.float32)
return numpy.array([_len],numpy.uint32).tobytes() \
+ numpy.array(_indexes).tobytes() \
+ numpy.array(_values).tobytes()
else:
_len =grp["length"][0]
offset= index*_len
return numpy.array(grp["x"][offset:offset+_len],numpy.float32).tobytes()
[docs]
def add_column_to_group(col,data,group,length):
if col["datatype"]=="text" or col["datatype"]=="unique" or col["datatype"]=="text16":
if data.dtype=="category":
data =data.cat.add_categories("ND")
data=data.fillna("ND")
values = data.value_counts()
if (len(values)<65537 and col["datatype"]!="unique"):
t8 = len(values)<257
col["datatype"]="text" if t8 else "text16"
dtype = numpy.ubyte if t8 else numpy.uint16
if not col.get("values"):
col["values"]= [ x for x in values.index if values[x] != 0 ]
vdict = {k: v for v, k in enumerate(col["values"])}
group.create_dataset(col["field"],length,dtype=dtype,data =data.map(vdict))
#convert to string
col["values"] = [str(x) for x in col["values"]]
else:
max_len=max(data.str.len())
utf8_type = h5py.string_dtype('utf-8',int(max_len))
col["datatype"]="unique"
col["stringLength"]=max_len
group.create_dataset(col["field"],length,data = data,dtype=utf8_type)
elif col["datatype"]=="multitext":
delim = col.get("delimiter",",")
values = set()
maxv=0
#first parse - get all possible values and max number
#of values in a single field
for v in data:
try:
vs = v.split(delim)
except:
continue
values.update([x.strip() for x in vs])
maxv = max(maxv,len(vs))
if "" in values:
values.remove("")
ndata = numpy.empty(shape=(length*maxv,))
ndata.fill(65535)
values = list(values)
#dict more efficient than index list
vmap = {k:v for v,k in enumerate(values)}
for i in range(0,length):
b= i*maxv
v= data[i]
if v=="":
continue
try:
vs = v.split(delim)
vs = [x.strip() for x in vs]
except:
continue
vs.sort()
for n in range(0,len(vs)):
ndata[b+n]=vmap[vs[n]]
col["values"]=values
col["stringLength"]=maxv
group.create_dataset(col["field"],length*maxv,data = ndata,dtype=numpy.uint16)
else:
dt = numpy.int32 if col["datatype"] == "int32" else numpy.float32
clean = data.apply(pandas.to_numeric,errors="coerce")
#faster but non=numeric values have to be certain values
# clean=data.replace("?",numpy.NaN).replace("ND",numpy.NaN).replace("None",numpy.NaN)
ds= group.create_dataset(col["field"],length,data = clean,dtype=dt)
#remove NaNs for min/max and quantiles
na = numpy.array(ds)
na = na[~numpy.isnan(na)]
col["minMax"]=[float(str(numpy.amin(na))),float(str(numpy.amax(na)))]
quantiles= [0.001,0.01,0.05]
col["quantiles"]={}
for q in quantiles:
col["quantiles"][str(q)]=[
numpy.percentile(na,100*q),
numpy.percentile(na,100*(1-q))
]
[docs]
def get_column_info(columns,dataframe,supplied_columns_only):
if columns:
for col in columns:
if not col.get("field"):
col["field"]=col["name"]
if not supplied_columns_only:
cols = [{"datatype":datatype_mappings[d.name], "name":c,"field":c} for d,c in zip(dataframe.dtypes,dataframe.columns)]
#replace with user given column metadata
if columns:
col_map={x["field"]:x for x in columns}
cols = [col_map.get(x["field"],x) for x in cols]
columns= cols
return columns
##!! will not work in windows and requires htslib installed
[docs]
def create_bed_gz_file(infile,outfile):
#need to sort
command = "sort -k1,1V -k2,2n -k3,3n {} > {}".format(shlex.quote(infile),shlex.quote(outfile))
os.system(command)
subprocess.run(["bgzip",outfile])
subprocess.run(["tabix",outfile+".gz"])
[docs]
def get_random_string(length=6):
return ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase + string.digits, k=length))