diff --git a/computeFeatures.py b/computeFeatures.py new file mode 100644 index 0000000000000000000000000000000000000000..f9164e848212c5f5ebabf496604bdc492a6fb57c --- /dev/null +++ b/computeFeatures.py @@ -0,0 +1,388 @@ +# This file performs the feature computation given the already predicted segmentation maps + +import numpy as np +import os +import sys +import cv2 +import torch +import math +import pandas as pd +import csv +import logging as log +from PIL import Image +import json +from scipy.ndimage import distance_transform_edt + +import openslide as osl + +from scipy.ndimage import zoom +import matplotlib.pyplot as plt + +from joblib import Parallel, delayed +import multiprocessing + +from skimage.measure import regionprops +from scipy.ndimage.measurements import label + +from skimage.color import rgb2gray +from skimage.transform import rescale, resize +from skimage.segmentation import flood, flood_fill +from skimage.color import rgb2gray +from skimage.segmentation import clear_border +from skimage import filters +from scipy.ndimage.morphology import binary_dilation, binary_closing, binary_fill_holes, binary_erosion, binary_opening +import scipy.ndimage +import scipy as sp +from skimage.morphology import remove_small_objects + +from utils import getBoundingBox + + +# Computes sizes, diameters, and distance_to_closest_glom features for each glom +def computeGlomFeatures(glomID): + instanceBig = (temp == glomID) + minX, maxX, minY, maxY = getBoundingBox(instanceBig) + minX = max(minX-1, 0); maxX += 2; minY = max(minY-1, 0); maxY += 2 + instance = instanceBig[minX:maxX, minY:maxY] + + size = instance.sum() * sizeOfPixel + diameter = np.max(distance_transform_edt(instance)) * lengthOfPixel * 2 + + if glomN == 1: + return [size, diameter, 0] + else: + shift_300um = int(300. / lengthOfPixel) + + minX = max(minX - shift_300um, 0); maxX += shift_300um; minY = max(minY - shift_300um, 0); maxY += shift_300um + segmCrop = np.isin(temp[minX:maxX, minY:maxY], [0, glomID]) + + while segmCrop.all(): + minX = max(minX - shift_300um, 0); maxX += shift_300um; minY = max(minY - shift_300um, 0); maxY += shift_300um + segmCrop = np.isin(temp[minX:maxX, minY:maxY], [0, glomID]) # please note that this way of expanding rectangular search regions, feature computation is strongly accelerated (instead of masking search regions using ball-shaped masks) + + distMap = distance_transform_edt(np.logical_not(instanceBig[minX:maxX, minY:maxY])) + + nextStructureDistance = min(distMap[np.logical_not(segmCrop)]) * lengthOfPixel + + return [size, diameter, nextStructureDistance] + + +# Computes sizes, diameters, and distance_to_closest_instance for each tubule +def computeTubuleFeatures(tubuliID): + instanceBig = segmResults == tubuliID + minX, maxX, minY, maxY = getBoundingBox(instanceBig) + minX = max(minX-1, 0); maxX += 2; minY = max(minY-1, 0); maxY += 2 + instance = instanceBig[minX:maxX, minY:maxY] + + size = instance.sum() * sizeOfPixel + diameter = np.max(distance_transform_edt(instance)) * lengthOfPixel * 2 + + minX = max(minX - shift, 0); maxX += shift; minY = max(minY - shift, 0); maxY += shift + segmCrop = np.isin(segmResults[minX:maxX, minY:maxY], [0, tubuliID]) + + while segmCrop.all(): + minX = max(minX - shift, 0); maxX += shift; minY = max(minY - shift, 0); maxY += shift + segmCrop = np.isin(segmResults[minX:maxX, minY:maxY], [0, tubuliID]) + + distMap = distance_transform_edt(np.logical_not(instanceBig[minX:maxX, minY:maxY])) + + nextStructureDistance = min(distMap[np.logical_not(segmCrop)]) * lengthOfPixel + + return [size, diameter, nextStructureDistance] + + +# Computes sizes, diameters, and distance_to_closest_instance for each artery and its lumen +def computeArteryFeatures(arteryID): + instanceBig = (temp == arteryID) + minX, maxX, minY, maxY = getBoundingBox(instanceBig) + minX = max(minX-1, 0); maxX += 2; minY = max(minY-1, 0); maxY += 2 + instance = instanceBig[minX:maxX, minY:maxY] + lumen = temp2[minX:maxX, minY:maxY] + + lumenMasked = np.logical_and(lumen, instance) + + size = instance.sum() * sizeOfPixel + lumenSize = lumenMasked.sum() * sizeOfPixel + arterialWallSize = size - lumenSize + + diameter = np.max(distance_transform_edt(instance)) * lengthOfPixel * 2 + diameterLumen = np.max(distance_transform_edt(lumenMasked)) * lengthOfPixel * 2 + diameterWall = (diameter - diameterLumen) / 2 + + shiftX = (maxX - minX) // 2 + shiftY = (maxY - minY) // 2 + minX = max(minX - shiftX, 0); maxX += shiftX; minY = max(minY - shiftY, 0); maxY += shiftY + + segmCrop = np.logical_not(np.isin(segmResults[minX:maxX, minY:maxY], [0, 5, 6])) + + while not segmCrop.any(): + minX = max(minX - 20, 0); maxX += 20; minY = max(minY - 20, 0); maxY += 20 + segmCrop = np.logical_not(np.isin(segmResults[minX:maxX, minY:maxY], [0, 5, 6])) + + distMap = distance_transform_edt(np.logical_not(instanceBig[minX:maxX, minY:maxY])) + + nextStructureDistance = min(distMap[segmCrop]) * lengthOfPixel + + return [size, lumenSize, arterialWallSize, diameter, diameterLumen, diameterWall, nextStructureDistance] + + + + +num_cores = multiprocessing.cpu_count() + +resultsRootFolder = '' # Folder comprising the predicted segmentation files +featureRootFolder = '' # Folder to save the computed features of their structures + +if not os.path.exists(featureRootFolder): + os.makedirs(featureRootFolder) + +######## +LOAD_PRECOMPUTED_FEATURES = False # In case of adjustments of already computed features (e.g. adding a feature), one can load them in and only perform computation for new features +######## + +tubuliInstanceID_StartsWith = 10 # => tubuli instances start with id 10 +bgLabel = 8 # => background label is 8 + +lengthOfPixel = 0.33721 # in micrometers, since we used 174um sized image patches sampled into 516x516 images +sizeOfPixel = lengthOfPixel * lengthOfPixel + +shift = 40 + +# Set up logger to save console output +log.basicConfig( + level=log.INFO, + format='%(asctime)s %(message)s', + datefmt='%Y/%m/%d %I:%M:%S %p', + handlers=[ + log.FileHandler(featureRootFolder + '/LOGS_computeFeatures.log', 'w'), + log.StreamHandler(sys.stdout) + ]) +logger = log.getLogger() + +tissuesWOinstances = [] + +try: + for dirName, subdirList, fileList in os.walk(resultsRootFolder): + if dirName.endswith('/npyFiles'): + allTissues = sorted(list(set(['_'.join(fname.split('_')[:-1]) for fname in fileList]))) + allSlides = list(set(['_'.join(fname.split('_')[:-2]) for fname in fileList])) + + resultsFolder = dirName.replace(resultsRootFolder, featureRootFolder)[:-9] + if not os.path.exists(resultsFolder): + os.makedirs(resultsFolder) + + logger.info('Processing folder -> {} <-'.format(dirName.split('/')[-2])) + logger.info('IN TOTAL: {} tissues from {} WSIs'.format(len(allTissues), len(allSlides))) + + for i, tissueName in enumerate(allTissues): + logger.info('[{}/{}] Process tissue: {}'.format(i + 1, len(allTissues), tissueName)) + + segmResults = np.load(dirName + '/' + tissueName + '_finalInstancePrediction.npy') + + # LABEL SUMMARY: segmResults: label2-6 + label8 BG + label>=10 Tubuli, tubuliInstances: label>=1 Tubuli + + ################################ START COMPUTING FEATURES ################################ + tissueFeatures = {} + + ####### CLASS INSTANCE NUMBERS + _, glomN = label(np.logical_or(segmResults == 2, segmResults == 3)) + _, tuftN = label(segmResults == 3) + _, arteryN = label(np.logical_or(segmResults == 5, segmResults == 6)) + _, lumenN = label(segmResults == 6) + + # if no instances available, skip + if glomN == arteryN == max(segmResults.max() - tubuliInstanceID_StartsWith+1, 0) == 0: + logger.info("No instances available, skipping...") + tissuesWOinstances.append(tissueName) + continue + + if LOAD_PRECOMPUTED_FEATURES: + tissueFeatures = pd.read_csv(resultsFolder + '/' + tissueName + '_features.csv').to_dict() + for key in tissueFeatures: + tissueFeatures[key] = np.array([no for no in list(tissueFeatures[key].values()) if not math.isnan(no)]) + + tissueFeatures['glom_N'] = [glomN] + tissueFeatures['tuft_N_segments'] = [tuftN] + tissueFeatures['artery_N'] = [arteryN] + tissueFeatures['lumen_N_segments'] = [lumenN] + tissueFeatures['tubule_N'] = [max(segmResults.max() - tubuliInstanceID_StartsWith+1, 0)] + + logger.info('#Instances: {} Tubuli, {} Gloms, {} Tufts, {} Arteries, {} Lumina'.format(tissueFeatures['tubule_N'][0], glomN, tuftN, arteryN, lumenN)) + + ####### TISSUE AREA SIZES + fgSize = np.logical_and(segmResults != 8, np.logical_not(segmResults == 4)).sum() + tissueFeatures['_tissue_size'] = [fgSize * sizeOfPixel] + + ####### CLASS AREA COVERAGE IN WHOLE TISSUE + tissueFeatures['glom_area%'] = [(np.logical_or(segmResults == 2, segmResults == 3)).sum() / fgSize * 100.] + tissueFeatures['tuft_area%'] = [(segmResults == 3).sum() / fgSize * 100.] + tissueFeatures['artery_area%'] = [(np.logical_or(segmResults == 5, segmResults == 6)).sum() / fgSize * 100.] + tissueFeatures['lumen_area%'] = [(segmResults == 6).sum() / fgSize * 100.] + tissueFeatures['tubule_area%'] = [(segmResults >= tubuliInstanceID_StartsWith).sum() / fgSize * 100.] + tissueFeatures['interstitium_area%'] = [(segmResults == 0).sum() / fgSize * 100.] + + ####### GLOM FEATURES + if glomN > 0: + ####### GLOM INSTANCE SIZES + DIAMETER + DISTANCE TO CLOSEST GLOM + temp, _ = label(np.logical_or(segmResults == 2, segmResults == 3)) + # tissueFeatures['glom_sizes'] = np.array([e.area * sizeOfPixel for e in regionprops(temp)]) + glom_features = np.array(Parallel(n_jobs=num_cores)(delayed(computeGlomFeatures)(i) for i in range(1, glomN+1))) + tissueFeatures['glom_sizes'] = glom_features[:, 0] + tissueFeatures['glom_diameters'] = glom_features[:, 1] + if glomN > 1: + tissueFeatures['glom_distance_to_closest_glom'] = glom_features[:, 2] + else: + tissueFeatures['glom_distance_to_closest_glom'] = [] + + ####### CLASS INSTANCE SIZES FOR TUFTS AND BOWMAN SPACE + temp[segmResults == 3] = 0 + bowmanSizes = np.array([e.area * sizeOfPixel for e in regionprops(temp)]) + if len(bowmanSizes) == glomN: + tissueFeatures['glom_bowman_sizes'] = bowmanSizes + else: + correctedBowmanSizes = np.zeros(glomN) + correctedBowmanSizes[np.array([e.label for e in regionprops(temp)])-1] = bowmanSizes + tissueFeatures['glom_bowman_sizes'] = correctedBowmanSizes + tissueFeatures['glom_tuft_sizes'] = tissueFeatures['glom_sizes'] - tissueFeatures['glom_bowman_sizes'] + + ####### GLOMERULI SHAPE FEATURES + temp, _ = label(np.logical_or(segmResults == 2, segmResults == 3)) + regionPropsTemp = regionprops(temp) + tissueFeatures['glom_shape_solidity'] = np.array([e.solidity for e in regionPropsTemp]) # convexity measure + tissueFeatures['glom_shape_circularity'] = np.array([(4.*np.pi*e.area)/np.square(e.perimeter) for e in regionPropsTemp]) # circularity measure + tissueFeatures['glom_shape_elongation'] = np.array([1.- e.minor_axis_length/e.major_axis_length for e in regionPropsTemp]) # from 0-1, 0=circle, the higher the number, the elongated subject is + tissueFeatures['glom_shape_eccentricity'] = np.array([e.eccentricity for e in regionPropsTemp]) # eccentricity, inertia ratio: 0 = circle, 1 = ellipse + + ####### GLOMERULI'S TUFT SHAPE FEATURES + temp, _ = label(np.logical_or(segmResults == 2, segmResults == 3)) + temp = temp * (segmResults == 3) + regionPropsTemp = regionprops(temp) + tissueFeatures['glom_tuft_shape_solidity'] = np.array([e.solidity for e in regionPropsTemp]) # convexity measure + tissueFeatures['glom_tuft_shape_circularity'] = np.array([(4.*np.pi*e.area)/np.square(e.perimeter) for e in regionPropsTemp]) # circularity measure + tissueFeatures['glom_tuft_shape_elongation'] = np.array([1.- e.minor_axis_length/e.major_axis_length for e in regionPropsTemp]) # from 0-1, 0=circle, the higher the number, the elongated subject is + tissueFeatures['glom_tuft_shape_eccentricity'] = np.array([e.eccentricity for e in regionPropsTemp]) # eccentricity, inertia ratio: 0 = circle, 1 = ellipse + + assert len(tissueFeatures['glom_tuft_shape_solidity']) == len(tissueFeatures['glom_tuft_shape_circularity']) == len(tissueFeatures['glom_tuft_shape_elongation']) == len(tissueFeatures['glom_tuft_shape_eccentricity']), "CARE: GLOM TUFT FEATURES UNEQUAL SIZE!" + if len(tissueFeatures['glom_tuft_shape_solidity']) != glomN: + correctedArray = np.zeros(glomN) - 1 + if len(tissueFeatures['glom_tuft_shape_solidity']) == 0: + tissueFeatures['glom_tuft_shape_solidity'] = tissueFeatures['glom_tuft_shape_circularity'] = tissueFeatures['glom_tuft_shape_elongation'] = tissueFeatures['glom_tuft_shape_eccentricity'] = correctedArray + else: + indexOfFeatures = np.array([e.label for e in regionPropsTemp])-1 + correctedArray[indexOfFeatures] = tissueFeatures['glom_tuft_shape_solidity']; tissueFeatures['glom_tuft_shape_solidity'] = correctedArray + correctedArray = np.zeros(glomN) - 1 + correctedArray[indexOfFeatures] = tissueFeatures['glom_tuft_shape_circularity']; tissueFeatures['glom_tuft_shape_circularity'] = correctedArray + correctedArray = np.zeros(glomN) - 1 + correctedArray[indexOfFeatures] = tissueFeatures['glom_tuft_shape_elongation']; tissueFeatures['glom_tuft_shape_elongation'] = correctedArray + correctedArray = np.zeros(glomN) - 1 + correctedArray[indexOfFeatures] = tissueFeatures['glom_tuft_shape_eccentricity']; tissueFeatures['glom_tuft_shape_eccentricity'] = correctedArray + + if 'tuft_shape_solidity' in tissueFeatures: + del tissueFeatures['tuft_shape_solidity'] + if 'tuft_shape_circularity' in tissueFeatures: + del tissueFeatures['tuft_shape_circularity'] + if 'tuft_shape_elongation' in tissueFeatures: + del tissueFeatures['tuft_shape_elongation'] + if 'tuft_shape_eccentricity' in tissueFeatures: + del tissueFeatures['tuft_shape_eccentricity'] + + # ####### TUFT FEATURES ON INSTANCE-LEVEL + # if tuftN > 0: + # ####### TUFT SHAPE FEATURES + # temp, _ = label(segmResults == 3) + # regionPropsTemp = regionprops(temp) + # tissueFeatures['tuft_shape_solidity'] = np.array([e.solidity for e in regionPropsTemp]) # convexity measure + # tissueFeatures['tuft_shape_circularity'] = np.array([(4.*np.pi*e.area)/np.square(e.perimeter) for e in regionPropsTemp]) # circularity measure + # tissueFeatures['tuft_shape_elongation'] = np.array([1.- e.minor_axis_length/e.major_axis_length for e in regionPropsTemp]) # from 0-1, 0=circle, the higher the number, the elongated subject is + # tissueFeatures['tuft_shape_eccentricity'] = np.array([e.eccentricity for e in regionPropsTemp]) # eccentricity, inertia ratio: 0 = circle, 1 = ellipse + + ####### TUBULE FEATURES + if tissueFeatures['tubule_N'][0] == 0: + tissueFeatures['tubule_sizes'], tissueFeatures['tubule_diameters'], tissueFeatures['tubule_distance_to_closest_instance'] = [], [], [] + else: + tubule_features = np.array(Parallel(n_jobs=num_cores)(delayed(computeTubuleFeatures)(i) for i in range(tubuliInstanceID_StartsWith, segmResults.max()+1))) + tissueFeatures['tubule_sizes'] = tubule_features[:, 0] + tissueFeatures['tubule_diameters'] = tubule_features[:, 1] + tissueFeatures['tubule_distance_to_closest_instance'] = tubule_features[:, 2] + + # ARTERY FEATURES + if arteryN == 0: + tissueFeatures['artery_sizes'], tissueFeatures['artery_sizes_lumen'], tissueFeatures['artery_sizes_wall'], tissueFeatures['artery_diameters'], tissueFeatures['artery_diameters_lumen'], tissueFeatures['artery_diameters_wall'], tissueFeatures['artery_structure_distance'] = [], [], [], [], [], [], [] + else: + temp, _ = label(np.logical_or(segmResults == 5, segmResults == 6)) + temp2 = segmResults == 6 + artery_features = np.array(Parallel(n_jobs=num_cores)(delayed(computeArteryFeatures)(i) for i in range(1, arteryN+1))) + tissueFeatures['artery_sizes'] = artery_features[:, 0] + tissueFeatures['artery_sizes_lumen'] = artery_features[:, 1] + tissueFeatures['artery_sizes_wall'] = artery_features[:, 2] + tissueFeatures['artery_diameters'] = artery_features[:, 3] + tissueFeatures['artery_diameters_lumen'] = artery_features[:, 4] + tissueFeatures['artery_diameters_wall'] = artery_features[:, 5] + tissueFeatures['artery_structure_distance'] = artery_features[:, 6] + + + # for key in tissueFeatures.keys(): + for key in list(tissueFeatures): + value = tissueFeatures[key] + if not (type(value).__module__ == 'numpy' and value.dtype == 'bool'): + tissueFeatures[key] = np.round(value, 2) + # if len(value) > 1: + # tissueFeatures[key + '_MEDIAN'] = [np.round(np.median(value), 2)] + # tissueFeatures[key + '_MEAN'] = [np.round(np.mean(value), 2)] + # tissueFeatures[key + '_STD'] = [np.round(np.std(value), 2)] + else: + if len(value) > 1: + tissueFeatures[key + '_%'] = [np.round(value.mean() * 100, 1)] + + + resultsDict = dict(sorted(tissueFeatures.items(), key=lambda item: item[0])) + + resultsPD = pd.DataFrame.from_dict(resultsDict, orient='index').transpose() + resultsPD.to_csv(resultsFolder + '/' + tissueName + '_features.csv', index=False, header=True, encoding='utf-8') + + +except: + logger.exception('! Exception !') + raise + +log.info('Amount of tissue samples without instances: {}'.format(len(tissuesWOinstances))) +if len(tissuesWOinstances) > 0: + log.info('Their Filenames: {}'.format(tissuesWOinstances)) +log.info('%%%% Ended regularly ! %%%%') + + +######################## Browse through all features and print structure counts +# import os +# import pandas as pd +# +# path = "" +# +# wsiCounter = 0 +# tissueCounter = 0 +# +# tubuleN = 0 +# glomN = 0 +# tuftN = 0 +# arteryN = 0 +# lumenN = 0 +# +# for dirName, subdirList, fileList in os.walk(path): +# filesCSV = [fname for fname in fileList if fname.endswith('.csv')] +# if len(filesCSV) > 0: +# tissueCounter += len(filesCSV) +# wsiCounter += len(set(['_'.join(fname.split('_')[:-2]) for fname in filesCSV])) +# +# for fname in filesCSV: +# resultsDict = pd.read_csv(os.path.join(dirName, fname)).to_dict() +# tubuleN += resultsDict['tubule_N'][0] +# glomN += resultsDict['glom_N'][0] +# tuftN += resultsDict['tuft_N_segments'][0] +# arteryN += resultsDict['artery_N'][0] +# lumenN += resultsDict['lumen_N_segments'][0] +# +# print('Analyzed WSI: {}, analyzed tissues: {}'.format(wsiCounter, tissueCounter)) +# print('Tubuli: {}'.format(tubuleN)) +# print('Glom: {}'.format(glomN)) +# print('Tuft: {}'.format(tuftN)) +# print('Artery: {}'.format(arteryN)) +# print('Lumen: {}'.format(lumenN)) \ No newline at end of file