Select Git revision
VAAudioInputSignalSource.cpp
computeFeatures.py 20.60 KiB
# This file performs the feature computation given the already predicted segmentation maps
import numpy as np
import os
import sys
import cv2
import torch
import math
import pandas as pd
import csv
import logging as log
from PIL import Image
import json
from scipy.ndimage import distance_transform_edt
import openslide as osl
from scipy.ndimage import zoom
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import multiprocessing
from skimage.measure import regionprops
from scipy.ndimage.measurements import label
from skimage.color import rgb2gray
from skimage.transform import rescale, resize
from skimage.segmentation import flood, flood_fill
from skimage.color import rgb2gray
from skimage.segmentation import clear_border
from skimage import filters
from scipy.ndimage.morphology import binary_dilation, binary_closing, binary_fill_holes, binary_erosion, binary_opening
import scipy.ndimage
import scipy as sp
from skimage.morphology import remove_small_objects
# Takes a binary mask image and outputs its bounding box
def getBoundingBox(img):
rows = np.any(img, axis=1)
cols = np.any(img, axis=0)
rmin, rmax = np.where(rows)[0][[0, -1]]
cmin, cmax = np.where(cols)[0][[0, -1]]
return rmin, rmax, cmin, cmax
# Computes sizes, diameters, and distance_to_closest_glom features for each glom
def computeGlomFeatures(glomID):
instanceBig = (temp == glomID)
minX, maxX, minY, maxY = getBoundingBox(instanceBig)
minX = max(minX-1, 0); maxX += 2; minY = max(minY-1, 0); maxY += 2
instance = instanceBig[minX:maxX, minY:maxY]
size = instance.sum() * sizeOfPixel
diameter = np.max(distance_transform_edt(instance)) * lengthOfPixel * 2
if glomN == 1:
return [size, diameter, 0]
else:
shift_300um = int(300. / lengthOfPixel)
minX = max(minX - shift_300um, 0); maxX += shift_300um; minY = max(minY - shift_300um, 0); maxY += shift_300um
segmCrop = np.isin(temp[minX:maxX, minY:maxY], [0, glomID])
while segmCrop.all():
minX = max(minX - shift_300um, 0); maxX += shift_300um; minY = max(minY - shift_300um, 0); maxY += shift_300um
segmCrop = np.isin(temp[minX:maxX, minY:maxY], [0, glomID]) # please note that this way of expanding rectangular search regions, feature computation is strongly accelerated (instead of masking search regions using ball-shaped masks)
distMap = distance_transform_edt(np.logical_not(instanceBig[minX:maxX, minY:maxY]))
nextStructureDistance = min(distMap[np.logical_not(segmCrop)]) * lengthOfPixel
return [size, diameter, nextStructureDistance]
# Computes sizes, diameters, and distance_to_closest_instance for each tubule
def computeTubuleFeatures(tubuliID):
instanceBig = segmResults == tubuliID
minX, maxX, minY, maxY = getBoundingBox(instanceBig)
minX = max(minX-1, 0); maxX += 2; minY = max(minY-1, 0); maxY += 2
instance = instanceBig[minX:maxX, minY:maxY]
size = instance.sum() * sizeOfPixel
diameter = np.max(distance_transform_edt(instance)) * lengthOfPixel * 2
minX = max(minX - shift, 0); maxX += shift; minY = max(minY - shift, 0); maxY += shift
segmCrop = np.isin(segmResults[minX:maxX, minY:maxY], [0, tubuliID])
while segmCrop.all():
minX = max(minX - shift, 0); maxX += shift; minY = max(minY - shift, 0); maxY += shift
segmCrop = np.isin(segmResults[minX:maxX, minY:maxY], [0, tubuliID])
distMap = distance_transform_edt(np.logical_not(instanceBig[minX:maxX, minY:maxY]))
nextStructureDistance = min(distMap[np.logical_not(segmCrop)]) * lengthOfPixel
return [size, diameter, nextStructureDistance]
# Computes sizes, diameters, and distance_to_closest_instance for each artery and its lumen
def computeArteryFeatures(arteryID):
instanceBig = (temp == arteryID)
minX, maxX, minY, maxY = getBoundingBox(instanceBig)
minX = max(minX-1, 0); maxX += 2; minY = max(minY-1, 0); maxY += 2
instance = instanceBig[minX:maxX, minY:maxY]
lumen = temp2[minX:maxX, minY:maxY]
lumenMasked = np.logical_and(lumen, instance)
size = instance.sum() * sizeOfPixel
lumenSize = lumenMasked.sum() * sizeOfPixel
arterialWallSize = size - lumenSize
diameter = np.max(distance_transform_edt(instance)) * lengthOfPixel * 2
diameterLumen = np.max(distance_transform_edt(lumenMasked)) * lengthOfPixel * 2
diameterWall = (diameter - diameterLumen) / 2
shiftX = (maxX - minX) // 2
shiftY = (maxY - minY) // 2
minX = max(minX - shiftX, 0); maxX += shiftX; minY = max(minY - shiftY, 0); maxY += shiftY
segmCrop = np.logical_not(np.isin(segmResults[minX:maxX, minY:maxY], [0, 5, 6]))
while not segmCrop.any():
minX = max(minX - 20, 0); maxX += 20; minY = max(minY - 20, 0); maxY += 20
segmCrop = np.logical_not(np.isin(segmResults[minX:maxX, minY:maxY], [0, 5, 6]))
distMap = distance_transform_edt(np.logical_not(instanceBig[minX:maxX, minY:maxY]))
nextStructureDistance = min(distMap[segmCrop]) * lengthOfPixel
return [size, lumenSize, arterialWallSize, diameter, diameterLumen, diameterWall, nextStructureDistance]
num_cores = multiprocessing.cpu_count()
resultsRootFolder = '' # Folder comprising the predicted segmentation files
featureRootFolder = '' # Folder to save the computed features of their structures
if not os.path.exists(featureRootFolder):
os.makedirs(featureRootFolder)
########
LOAD_PRECOMPUTED_FEATURES = False # In case of adjustments of already computed features (e.g. adding a feature), one can load them in and only perform computation for new features
########
tubuliInstanceID_StartsWith = 10 # => tubuli instances start with id 10
bgLabel = 8 # => background label is 8
lengthOfPixel = 0.33721 # in micrometers, since we used 174um sized image patches sampled into 516x516 images
sizeOfPixel = lengthOfPixel * lengthOfPixel
shift = 40
# Set up logger to save console output
log.basicConfig(
level=log.INFO,
format='%(asctime)s %(message)s',
datefmt='%Y/%m/%d %I:%M:%S %p',
handlers=[
log.FileHandler(featureRootFolder + '/LOGS_computeFeatures.log', 'w'),
log.StreamHandler(sys.stdout)
])
logger = log.getLogger()
tissuesWOinstances = []
try:
for dirName, subdirList, fileList in os.walk(resultsRootFolder):
if dirName.endswith('/npyFiles'):
allTissues = sorted(list(set(['_'.join(fname.split('_')[:-1]) for fname in fileList])))
allSlides = list(set(['_'.join(fname.split('_')[:-2]) for fname in fileList]))
resultsFolder = dirName.replace(resultsRootFolder, featureRootFolder)[:-9]
if not os.path.exists(resultsFolder):
os.makedirs(resultsFolder)
logger.info('Processing folder -> {} <-'.format(dirName.split('/')[-2]))
logger.info('IN TOTAL: {} tissues from {} WSIs'.format(len(allTissues), len(allSlides)))
for i, tissueName in enumerate(allTissues):
logger.info('[{}/{}] Process tissue: {}'.format(i + 1, len(allTissues), tissueName))
segmResults = np.load(dirName + '/' + tissueName + '_finalInstancePrediction.npy')
# LABEL SUMMARY: segmResults: label2-6 + label8 BG + label>=10 Tubuli, tubuliInstances: label>=1 Tubuli
################################ START COMPUTING FEATURES ################################
tissueFeatures = {}
####### CLASS INSTANCE NUMBERS
_, glomN = label(np.logical_or(segmResults == 2, segmResults == 3))
_, tuftN = label(segmResults == 3)
_, arteryN = label(np.logical_or(segmResults == 5, segmResults == 6))
_, lumenN = label(segmResults == 6)
# if no instances available, skip
if glomN == arteryN == max(segmResults.max() - tubuliInstanceID_StartsWith+1, 0) == 0:
logger.info("No instances available, skipping...")
tissuesWOinstances.append(tissueName)
continue
if LOAD_PRECOMPUTED_FEATURES:
tissueFeatures = pd.read_csv(resultsFolder + '/' + tissueName + '_features.csv').to_dict()
for key in tissueFeatures:
tissueFeatures[key] = np.array([no for no in list(tissueFeatures[key].values()) if not math.isnan(no)])
tissueFeatures['glom_N'] = [glomN]
tissueFeatures['tuft_N_segments'] = [tuftN]
tissueFeatures['artery_N'] = [arteryN]
tissueFeatures['lumen_N_segments'] = [lumenN]
tissueFeatures['tubule_N'] = [max(segmResults.max() - tubuliInstanceID_StartsWith+1, 0)]
logger.info('#Instances: {} Tubuli, {} Gloms, {} Tufts, {} Arteries, {} Lumina'.format(tissueFeatures['tubule_N'][0], glomN, tuftN, arteryN, lumenN))
####### TISSUE AREA SIZES
fgSize = np.logical_and(segmResults != 8, np.logical_not(segmResults == 4)).sum()
tissueFeatures['_tissue_size'] = [fgSize * sizeOfPixel]
####### CLASS AREA COVERAGE IN WHOLE TISSUE
tissueFeatures['glom_area%'] = [(np.logical_or(segmResults == 2, segmResults == 3)).sum() / fgSize * 100.]
tissueFeatures['tuft_area%'] = [(segmResults == 3).sum() / fgSize * 100.]
tissueFeatures['artery_area%'] = [(np.logical_or(segmResults == 5, segmResults == 6)).sum() / fgSize * 100.]
tissueFeatures['lumen_area%'] = [(segmResults == 6).sum() / fgSize * 100.]
tissueFeatures['tubule_area%'] = [(segmResults >= tubuliInstanceID_StartsWith).sum() / fgSize * 100.]
tissueFeatures['interstitium_area%'] = [(segmResults == 0).sum() / fgSize * 100.]
####### GLOM FEATURES
if glomN > 0:
####### GLOM INSTANCE SIZES + DIAMETER + DISTANCE TO CLOSEST GLOM
temp, _ = label(np.logical_or(segmResults == 2, segmResults == 3))
# tissueFeatures['glom_sizes'] = np.array([e.area * sizeOfPixel for e in regionprops(temp)])
glom_features = np.array(Parallel(n_jobs=num_cores)(delayed(computeGlomFeatures)(i) for i in range(1, glomN+1)))
tissueFeatures['glom_sizes'] = glom_features[:, 0]
tissueFeatures['glom_diameters'] = glom_features[:, 1]
if glomN > 1:
tissueFeatures['glom_distance_to_closest_glom'] = glom_features[:, 2]
else:
tissueFeatures['glom_distance_to_closest_glom'] = []
####### CLASS INSTANCE SIZES FOR TUFTS AND BOWMAN SPACE
temp[segmResults == 3] = 0
bowmanSizes = np.array([e.area * sizeOfPixel for e in regionprops(temp)])
if len(bowmanSizes) == glomN:
tissueFeatures['glom_bowman_sizes'] = bowmanSizes
else:
correctedBowmanSizes = np.zeros(glomN)
correctedBowmanSizes[np.array([e.label for e in regionprops(temp)])-1] = bowmanSizes
tissueFeatures['glom_bowman_sizes'] = correctedBowmanSizes
tissueFeatures['glom_tuft_sizes'] = tissueFeatures['glom_sizes'] - tissueFeatures['glom_bowman_sizes']
####### GLOMERULI SHAPE FEATURES
temp, _ = label(np.logical_or(segmResults == 2, segmResults == 3))
regionPropsTemp = regionprops(temp)
tissueFeatures['glom_shape_solidity'] = np.array([e.solidity for e in regionPropsTemp]) # convexity measure
tissueFeatures['glom_shape_circularity'] = np.array([(4.*np.pi*e.area)/np.square(e.perimeter) for e in regionPropsTemp]) # circularity measure
tissueFeatures['glom_shape_elongation'] = np.array([1.- e.minor_axis_length/e.major_axis_length for e in regionPropsTemp]) # from 0-1, 0=circle, the higher the number, the elongated subject is
tissueFeatures['glom_shape_eccentricity'] = np.array([e.eccentricity for e in regionPropsTemp]) # eccentricity, inertia ratio: 0 = circle, 1 = ellipse
####### GLOMERULI'S TUFT SHAPE FEATURES
temp, _ = label(np.logical_or(segmResults == 2, segmResults == 3))
temp = temp * (segmResults == 3)
regionPropsTemp = regionprops(temp)
tissueFeatures['glom_tuft_shape_solidity'] = np.array([e.solidity for e in regionPropsTemp]) # convexity measure
tissueFeatures['glom_tuft_shape_circularity'] = np.array([(4.*np.pi*e.area)/np.square(e.perimeter) for e in regionPropsTemp]) # circularity measure
tissueFeatures['glom_tuft_shape_elongation'] = np.array([1.- e.minor_axis_length/e.major_axis_length for e in regionPropsTemp]) # from 0-1, 0=circle, the higher the number, the elongated subject is
tissueFeatures['glom_tuft_shape_eccentricity'] = np.array([e.eccentricity for e in regionPropsTemp]) # eccentricity, inertia ratio: 0 = circle, 1 = ellipse
assert len(tissueFeatures['glom_tuft_shape_solidity']) == len(tissueFeatures['glom_tuft_shape_circularity']) == len(tissueFeatures['glom_tuft_shape_elongation']) == len(tissueFeatures['glom_tuft_shape_eccentricity']), "CARE: GLOM TUFT FEATURES UNEQUAL SIZE!"
if len(tissueFeatures['glom_tuft_shape_solidity']) != glomN:
correctedArray = np.zeros(glomN) - 1
if len(tissueFeatures['glom_tuft_shape_solidity']) == 0:
tissueFeatures['glom_tuft_shape_solidity'] = tissueFeatures['glom_tuft_shape_circularity'] = tissueFeatures['glom_tuft_shape_elongation'] = tissueFeatures['glom_tuft_shape_eccentricity'] = correctedArray
else:
indexOfFeatures = np.array([e.label for e in regionPropsTemp])-1
correctedArray[indexOfFeatures] = tissueFeatures['glom_tuft_shape_solidity']; tissueFeatures['glom_tuft_shape_solidity'] = correctedArray
correctedArray = np.zeros(glomN) - 1
correctedArray[indexOfFeatures] = tissueFeatures['glom_tuft_shape_circularity']; tissueFeatures['glom_tuft_shape_circularity'] = correctedArray
correctedArray = np.zeros(glomN) - 1
correctedArray[indexOfFeatures] = tissueFeatures['glom_tuft_shape_elongation']; tissueFeatures['glom_tuft_shape_elongation'] = correctedArray
correctedArray = np.zeros(glomN) - 1
correctedArray[indexOfFeatures] = tissueFeatures['glom_tuft_shape_eccentricity']; tissueFeatures['glom_tuft_shape_eccentricity'] = correctedArray
if 'tuft_shape_solidity' in tissueFeatures:
del tissueFeatures['tuft_shape_solidity']
if 'tuft_shape_circularity' in tissueFeatures:
del tissueFeatures['tuft_shape_circularity']
if 'tuft_shape_elongation' in tissueFeatures:
del tissueFeatures['tuft_shape_elongation']
if 'tuft_shape_eccentricity' in tissueFeatures:
del tissueFeatures['tuft_shape_eccentricity']
# ####### TUFT FEATURES ON INSTANCE-LEVEL
# if tuftN > 0:
# ####### TUFT SHAPE FEATURES
# temp, _ = label(segmResults == 3)
# regionPropsTemp = regionprops(temp)
# tissueFeatures['tuft_shape_solidity'] = np.array([e.solidity for e in regionPropsTemp]) # convexity measure
# tissueFeatures['tuft_shape_circularity'] = np.array([(4.*np.pi*e.area)/np.square(e.perimeter) for e in regionPropsTemp]) # circularity measure
# tissueFeatures['tuft_shape_elongation'] = np.array([1.- e.minor_axis_length/e.major_axis_length for e in regionPropsTemp]) # from 0-1, 0=circle, the higher the number, the elongated subject is
# tissueFeatures['tuft_shape_eccentricity'] = np.array([e.eccentricity for e in regionPropsTemp]) # eccentricity, inertia ratio: 0 = circle, 1 = ellipse
####### TUBULE FEATURES
if tissueFeatures['tubule_N'][0] == 0:
tissueFeatures['tubule_sizes'], tissueFeatures['tubule_diameters'], tissueFeatures['tubule_distance_to_closest_instance'] = [], [], []
else:
tubule_features = np.array(Parallel(n_jobs=num_cores)(delayed(computeTubuleFeatures)(i) for i in range(tubuliInstanceID_StartsWith, segmResults.max()+1)))
tissueFeatures['tubule_sizes'] = tubule_features[:, 0]
tissueFeatures['tubule_diameters'] = tubule_features[:, 1]
tissueFeatures['tubule_distance_to_closest_instance'] = tubule_features[:, 2]
# ARTERY FEATURES
if arteryN == 0:
tissueFeatures['artery_sizes'], tissueFeatures['artery_sizes_lumen'], tissueFeatures['artery_sizes_wall'], tissueFeatures['artery_diameters'], tissueFeatures['artery_diameters_lumen'], tissueFeatures['artery_diameters_wall'], tissueFeatures['artery_structure_distance'] = [], [], [], [], [], [], []
else:
temp, _ = label(np.logical_or(segmResults == 5, segmResults == 6))
temp2 = segmResults == 6
artery_features = np.array(Parallel(n_jobs=num_cores)(delayed(computeArteryFeatures)(i) for i in range(1, arteryN+1)))
tissueFeatures['artery_sizes'] = artery_features[:, 0]
tissueFeatures['artery_sizes_lumen'] = artery_features[:, 1]
tissueFeatures['artery_sizes_wall'] = artery_features[:, 2]
tissueFeatures['artery_diameters'] = artery_features[:, 3]
tissueFeatures['artery_diameters_lumen'] = artery_features[:, 4]
tissueFeatures['artery_diameters_wall'] = artery_features[:, 5]
tissueFeatures['artery_structure_distance'] = artery_features[:, 6]
# for key in tissueFeatures.keys():
for key in list(tissueFeatures):
value = tissueFeatures[key]
if not (type(value).__module__ == 'numpy' and value.dtype == 'bool'):
tissueFeatures[key] = np.round(value, 2)
# if len(value) > 1:
# tissueFeatures[key + '_MEDIAN'] = [np.round(np.median(value), 2)]
# tissueFeatures[key + '_MEAN'] = [np.round(np.mean(value), 2)]
# tissueFeatures[key + '_STD'] = [np.round(np.std(value), 2)]
else:
if len(value) > 1:
tissueFeatures[key + '_%'] = [np.round(value.mean() * 100, 1)]
resultsDict = dict(sorted(tissueFeatures.items(), key=lambda item: item[0]))
resultsPD = pd.DataFrame.from_dict(resultsDict, orient='index').transpose()
resultsPD.to_csv(resultsFolder + '/' + tissueName + '_features.csv', index=False, header=True, encoding='utf-8')
except:
logger.exception('! Exception !')
raise
log.info('Amount of tissue samples without instances: {}'.format(len(tissuesWOinstances)))
if len(tissuesWOinstances) > 0:
log.info('Their Filenames: {}'.format(tissuesWOinstances))
log.info('%%%% Ended regularly ! %%%%')
######################## Browse through all features and print structure counts
# import os
# import pandas as pd
#
# path = ""
#
# wsiCounter = 0
# tissueCounter = 0
#
# tubuleN = 0
# glomN = 0
# tuftN = 0
# arteryN = 0
# lumenN = 0
#
# for dirName, subdirList, fileList in os.walk(path):
# filesCSV = [fname for fname in fileList if fname.endswith('.csv')]
# if len(filesCSV) > 0:
# tissueCounter += len(filesCSV)
# wsiCounter += len(set(['_'.join(fname.split('_')[:-2]) for fname in filesCSV]))
#
# for fname in filesCSV:
# resultsDict = pd.read_csv(os.path.join(dirName, fname)).to_dict()
# tubuleN += resultsDict['tubule_N'][0]
# glomN += resultsDict['glom_N'][0]
# tuftN += resultsDict['tuft_N_segments'][0]
# arteryN += resultsDict['artery_N'][0]
# lumenN += resultsDict['lumen_N_segments'][0]
#
# print('Analyzed WSI: {}, analyzed tissues: {}'.format(wsiCounter, tissueCounter))
# print('Tubuli: {}'.format(tubuleN))
# print('Glom: {}'.format(glomN))
# print('Tuft: {}'.format(tuftN))
# print('Artery: {}'.format(arteryN))
# print('Lumen: {}'.format(lumenN))