Skip to content
Snippets Groups Projects
Select Git revision
  • 38d3784f1870af98c9a537bf0a14269b386f6fe5
  • develop default protected
  • 5.5
  • 5.1
  • master protected
  • deprecated/4-22
6 results

UniLogBlueprintFunctionLibrary.cpp

Blame
  • computeFeatures.py 20.60 KiB
    # This file performs the feature computation given the already predicted segmentation maps
    
    import numpy as np
    import os
    import sys
    import cv2
    import torch
    import math
    import pandas as pd
    import csv
    import logging as log
    from PIL import Image
    import json
    from scipy.ndimage import distance_transform_edt
    
    import openslide as osl
    
    from scipy.ndimage import zoom
    import matplotlib.pyplot as plt
    
    from joblib import Parallel, delayed
    import multiprocessing
    
    from skimage.measure import regionprops
    from scipy.ndimage.measurements import label
    
    from skimage.color import rgb2gray
    from skimage.transform import rescale, resize
    from skimage.segmentation import flood, flood_fill
    from skimage.color import rgb2gray
    from skimage.segmentation import clear_border
    from skimage import filters
    from scipy.ndimage.morphology import binary_dilation, binary_closing, binary_fill_holes, binary_erosion, binary_opening
    import scipy.ndimage
    import scipy as sp
    from skimage.morphology import remove_small_objects
    
    
    # Takes a binary mask image and outputs its bounding box
    def getBoundingBox(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
    
        return rmin, rmax, cmin, cmax
    
    
    # Computes sizes, diameters, and distance_to_closest_glom features for each glom
    def computeGlomFeatures(glomID):
        instanceBig = (temp == glomID)
        minX, maxX, minY, maxY = getBoundingBox(instanceBig)
        minX = max(minX-1, 0); maxX += 2; minY = max(minY-1, 0); maxY += 2
        instance = instanceBig[minX:maxX, minY:maxY]
    
        size = instance.sum() * sizeOfPixel
        diameter = np.max(distance_transform_edt(instance)) * lengthOfPixel * 2
    
        if glomN == 1:
            return [size, diameter, 0]
        else:
            shift_300um = int(300. / lengthOfPixel)
    
            minX = max(minX - shift_300um, 0); maxX += shift_300um; minY = max(minY - shift_300um, 0); maxY += shift_300um
            segmCrop = np.isin(temp[minX:maxX, minY:maxY], [0, glomID])
    
            while segmCrop.all():
                minX = max(minX - shift_300um, 0); maxX += shift_300um; minY = max(minY - shift_300um, 0); maxY += shift_300um
                segmCrop = np.isin(temp[minX:maxX, minY:maxY], [0, glomID]) # please note that this way of expanding rectangular search regions, feature computation is strongly accelerated (instead of masking search regions using ball-shaped masks)
    
            distMap = distance_transform_edt(np.logical_not(instanceBig[minX:maxX, minY:maxY]))
    
            nextStructureDistance = min(distMap[np.logical_not(segmCrop)]) * lengthOfPixel
    
            return [size, diameter, nextStructureDistance]
    
    
    # Computes sizes, diameters, and distance_to_closest_instance for each tubule
    def computeTubuleFeatures(tubuliID):
        instanceBig = segmResults == tubuliID
        minX, maxX, minY, maxY = getBoundingBox(instanceBig)
        minX = max(minX-1, 0); maxX += 2; minY = max(minY-1, 0); maxY += 2
        instance = instanceBig[minX:maxX, minY:maxY]
    
        size = instance.sum() * sizeOfPixel
        diameter = np.max(distance_transform_edt(instance)) * lengthOfPixel * 2
    
        minX = max(minX - shift, 0); maxX += shift; minY = max(minY - shift, 0); maxY += shift
        segmCrop = np.isin(segmResults[minX:maxX, minY:maxY], [0, tubuliID])
    
        while segmCrop.all():
            minX = max(minX - shift, 0); maxX += shift; minY = max(minY - shift, 0); maxY += shift
            segmCrop = np.isin(segmResults[minX:maxX, minY:maxY], [0, tubuliID])
    
        distMap = distance_transform_edt(np.logical_not(instanceBig[minX:maxX, minY:maxY]))
    
        nextStructureDistance = min(distMap[np.logical_not(segmCrop)]) * lengthOfPixel
    
        return [size, diameter, nextStructureDistance]
    
    
    # Computes sizes, diameters, and distance_to_closest_instance for each artery and its lumen
    def computeArteryFeatures(arteryID):
        instanceBig = (temp == arteryID)
        minX, maxX, minY, maxY = getBoundingBox(instanceBig)
        minX = max(minX-1, 0); maxX += 2; minY = max(minY-1, 0); maxY += 2
        instance = instanceBig[minX:maxX, minY:maxY]
        lumen = temp2[minX:maxX, minY:maxY]
    
        lumenMasked = np.logical_and(lumen, instance)
    
        size = instance.sum() * sizeOfPixel
        lumenSize = lumenMasked.sum() * sizeOfPixel
        arterialWallSize = size - lumenSize
    
        diameter = np.max(distance_transform_edt(instance)) * lengthOfPixel * 2
        diameterLumen = np.max(distance_transform_edt(lumenMasked)) * lengthOfPixel * 2
        diameterWall = (diameter - diameterLumen) / 2
    
        shiftX = (maxX - minX) // 2
        shiftY = (maxY - minY) // 2
        minX = max(minX - shiftX, 0); maxX += shiftX; minY = max(minY - shiftY, 0); maxY += shiftY
    
        segmCrop = np.logical_not(np.isin(segmResults[minX:maxX, minY:maxY], [0, 5, 6]))
    
        while not segmCrop.any():
            minX = max(minX - 20, 0); maxX += 20; minY = max(minY - 20, 0); maxY += 20
            segmCrop = np.logical_not(np.isin(segmResults[minX:maxX, minY:maxY], [0, 5, 6]))
    
        distMap = distance_transform_edt(np.logical_not(instanceBig[minX:maxX, minY:maxY]))
    
        nextStructureDistance = min(distMap[segmCrop]) * lengthOfPixel
    
        return [size, lumenSize, arterialWallSize, diameter, diameterLumen, diameterWall, nextStructureDistance]
    
    
    
    
    num_cores = multiprocessing.cpu_count()
    
    resultsRootFolder = '' # Folder comprising the predicted segmentation files
    featureRootFolder = '' # Folder to save the computed features of their structures
    
    if not os.path.exists(featureRootFolder):
        os.makedirs(featureRootFolder)
    
    ########
    LOAD_PRECOMPUTED_FEATURES = False # In case of adjustments of already computed features (e.g. adding a feature), one can load them in and only perform computation for new features
    ########
    
    tubuliInstanceID_StartsWith = 10 # => tubuli instances start with id 10
    bgLabel = 8 # => background label is 8
    
    lengthOfPixel = 0.33721 # in micrometers, since we used 174um sized image patches sampled into 516x516 images
    sizeOfPixel = lengthOfPixel * lengthOfPixel
    
    shift = 40
    
    # Set up logger to save console output
    log.basicConfig(
        level=log.INFO,
        format='%(asctime)s %(message)s',
        datefmt='%Y/%m/%d %I:%M:%S %p',
        handlers=[
            log.FileHandler(featureRootFolder + '/LOGS_computeFeatures.log', 'w'),
            log.StreamHandler(sys.stdout)
        ])
    logger = log.getLogger()
    
    tissuesWOinstances = []
    
    try:
        for dirName, subdirList, fileList in os.walk(resultsRootFolder):
            if dirName.endswith('/npyFiles'):
                allTissues = sorted(list(set(['_'.join(fname.split('_')[:-1]) for fname in fileList])))
                allSlides = list(set(['_'.join(fname.split('_')[:-2]) for fname in fileList]))
    
                resultsFolder = dirName.replace(resultsRootFolder, featureRootFolder)[:-9]
                if not os.path.exists(resultsFolder):
                    os.makedirs(resultsFolder)
    
                logger.info('Processing folder -> {} <-'.format(dirName.split('/')[-2]))
                logger.info('IN TOTAL: {} tissues from {} WSIs'.format(len(allTissues), len(allSlides)))
    
                for i, tissueName in enumerate(allTissues):
                    logger.info('[{}/{}] Process tissue: {}'.format(i + 1, len(allTissues), tissueName))
    
                    segmResults = np.load(dirName + '/' + tissueName + '_finalInstancePrediction.npy')
    
                    # LABEL SUMMARY: segmResults: label2-6 + label8 BG + label>=10 Tubuli, tubuliInstances: label>=1 Tubuli
    
                    ################################ START COMPUTING FEATURES ################################
                    tissueFeatures = {}
    
                    ####### CLASS INSTANCE NUMBERS
                    _, glomN = label(np.logical_or(segmResults == 2, segmResults == 3))
                    _, tuftN = label(segmResults == 3)
                    _, arteryN = label(np.logical_or(segmResults == 5, segmResults == 6))
                    _, lumenN = label(segmResults == 6)
    
                    # if no instances available, skip
                    if glomN == arteryN == max(segmResults.max() - tubuliInstanceID_StartsWith+1, 0) == 0:
                        logger.info("No instances available, skipping...")
                        tissuesWOinstances.append(tissueName)
                        continue
    
                    if LOAD_PRECOMPUTED_FEATURES:
                        tissueFeatures = pd.read_csv(resultsFolder + '/' + tissueName + '_features.csv').to_dict()
                        for key in tissueFeatures:
                            tissueFeatures[key] = np.array([no for no in list(tissueFeatures[key].values()) if not math.isnan(no)])
    
                    tissueFeatures['glom_N'] = [glomN]
                    tissueFeatures['tuft_N_segments'] = [tuftN]
                    tissueFeatures['artery_N'] = [arteryN]
                    tissueFeatures['lumen_N_segments'] = [lumenN]
                    tissueFeatures['tubule_N'] = [max(segmResults.max() - tubuliInstanceID_StartsWith+1, 0)]
    
                    logger.info('#Instances: {} Tubuli, {} Gloms, {} Tufts, {} Arteries, {} Lumina'.format(tissueFeatures['tubule_N'][0], glomN, tuftN, arteryN, lumenN))
    
                    ####### TISSUE AREA SIZES
                    fgSize = np.logical_and(segmResults != 8, np.logical_not(segmResults == 4)).sum()
                    tissueFeatures['_tissue_size'] = [fgSize * sizeOfPixel]
                    
                    ####### CLASS AREA COVERAGE IN WHOLE TISSUE
                    tissueFeatures['glom_area%'] = [(np.logical_or(segmResults == 2, segmResults == 3)).sum() / fgSize * 100.]
                    tissueFeatures['tuft_area%'] = [(segmResults == 3).sum() / fgSize * 100.]
                    tissueFeatures['artery_area%'] = [(np.logical_or(segmResults == 5, segmResults == 6)).sum() / fgSize * 100.]
                    tissueFeatures['lumen_area%'] = [(segmResults == 6).sum() / fgSize * 100.]
                    tissueFeatures['tubule_area%'] = [(segmResults >= tubuliInstanceID_StartsWith).sum() / fgSize * 100.]
                    tissueFeatures['interstitium_area%'] = [(segmResults == 0).sum() / fgSize * 100.]
    
                    ####### GLOM FEATURES
                    if glomN > 0:
                        ####### GLOM INSTANCE SIZES + DIAMETER + DISTANCE TO CLOSEST GLOM
                        temp, _ = label(np.logical_or(segmResults == 2, segmResults == 3))
                        # tissueFeatures['glom_sizes'] = np.array([e.area * sizeOfPixel for e in regionprops(temp)])
                        glom_features = np.array(Parallel(n_jobs=num_cores)(delayed(computeGlomFeatures)(i) for i in range(1, glomN+1)))
                        tissueFeatures['glom_sizes'] = glom_features[:, 0]
                        tissueFeatures['glom_diameters'] = glom_features[:, 1]
                        if glomN > 1:
                            tissueFeatures['glom_distance_to_closest_glom'] = glom_features[:, 2]
                        else:
                            tissueFeatures['glom_distance_to_closest_glom'] = []
                        
                        ####### CLASS INSTANCE SIZES FOR TUFTS AND BOWMAN SPACE
                        temp[segmResults == 3] = 0
                        bowmanSizes = np.array([e.area * sizeOfPixel for e in regionprops(temp)])
                        if len(bowmanSizes) == glomN:
                            tissueFeatures['glom_bowman_sizes'] = bowmanSizes
                        else:
                            correctedBowmanSizes = np.zeros(glomN)
                            correctedBowmanSizes[np.array([e.label for e in regionprops(temp)])-1] = bowmanSizes
                            tissueFeatures['glom_bowman_sizes'] = correctedBowmanSizes
                        tissueFeatures['glom_tuft_sizes'] = tissueFeatures['glom_sizes'] - tissueFeatures['glom_bowman_sizes']
                        
                        ####### GLOMERULI SHAPE FEATURES
                        temp, _ = label(np.logical_or(segmResults == 2, segmResults == 3))
                        regionPropsTemp = regionprops(temp)
                        tissueFeatures['glom_shape_solidity'] = np.array([e.solidity for e in regionPropsTemp])  # convexity measure
                        tissueFeatures['glom_shape_circularity'] = np.array([(4.*np.pi*e.area)/np.square(e.perimeter) for e in regionPropsTemp])  # circularity measure
                        tissueFeatures['glom_shape_elongation'] = np.array([1.- e.minor_axis_length/e.major_axis_length for e in regionPropsTemp])  # from 0-1, 0=circle, the higher the number, the elongated subject is
                        tissueFeatures['glom_shape_eccentricity'] = np.array([e.eccentricity for e in regionPropsTemp])  # eccentricity, inertia ratio: 0 = circle, 1 = ellipse
    
                        ####### GLOMERULI'S TUFT SHAPE FEATURES
                        temp, _ = label(np.logical_or(segmResults == 2, segmResults == 3))
                        temp = temp * (segmResults == 3)
                        regionPropsTemp = regionprops(temp)
                        tissueFeatures['glom_tuft_shape_solidity'] = np.array([e.solidity for e in regionPropsTemp])  # convexity measure
                        tissueFeatures['glom_tuft_shape_circularity'] = np.array([(4.*np.pi*e.area)/np.square(e.perimeter) for e in regionPropsTemp])  # circularity measure
                        tissueFeatures['glom_tuft_shape_elongation'] = np.array([1.- e.minor_axis_length/e.major_axis_length for e in regionPropsTemp])  # from 0-1, 0=circle, the higher the number, the elongated subject is
                        tissueFeatures['glom_tuft_shape_eccentricity'] = np.array([e.eccentricity for e in regionPropsTemp])  # eccentricity, inertia ratio: 0 = circle, 1 = ellipse
    
                        assert len(tissueFeatures['glom_tuft_shape_solidity']) == len(tissueFeatures['glom_tuft_shape_circularity']) == len(tissueFeatures['glom_tuft_shape_elongation']) == len(tissueFeatures['glom_tuft_shape_eccentricity']), "CARE: GLOM TUFT FEATURES UNEQUAL SIZE!"
                        if len(tissueFeatures['glom_tuft_shape_solidity']) != glomN:
                            correctedArray = np.zeros(glomN) - 1
                            if len(tissueFeatures['glom_tuft_shape_solidity']) == 0:
                                tissueFeatures['glom_tuft_shape_solidity'] = tissueFeatures['glom_tuft_shape_circularity'] = tissueFeatures['glom_tuft_shape_elongation'] = tissueFeatures['glom_tuft_shape_eccentricity'] = correctedArray
                            else:
                                indexOfFeatures = np.array([e.label for e in regionPropsTemp])-1
                                correctedArray[indexOfFeatures] = tissueFeatures['glom_tuft_shape_solidity']; tissueFeatures['glom_tuft_shape_solidity'] = correctedArray
                                correctedArray = np.zeros(glomN) - 1
                                correctedArray[indexOfFeatures] = tissueFeatures['glom_tuft_shape_circularity']; tissueFeatures['glom_tuft_shape_circularity'] = correctedArray
                                correctedArray = np.zeros(glomN) - 1
                                correctedArray[indexOfFeatures] = tissueFeatures['glom_tuft_shape_elongation']; tissueFeatures['glom_tuft_shape_elongation'] = correctedArray
                                correctedArray = np.zeros(glomN) - 1
                                correctedArray[indexOfFeatures] = tissueFeatures['glom_tuft_shape_eccentricity']; tissueFeatures['glom_tuft_shape_eccentricity'] = correctedArray
    
                        if 'tuft_shape_solidity' in tissueFeatures:
                            del tissueFeatures['tuft_shape_solidity']
                        if 'tuft_shape_circularity' in tissueFeatures:
                            del tissueFeatures['tuft_shape_circularity']
                        if 'tuft_shape_elongation' in tissueFeatures:
                            del tissueFeatures['tuft_shape_elongation']
                        if 'tuft_shape_eccentricity' in tissueFeatures:
                            del tissueFeatures['tuft_shape_eccentricity']
    
                    # ####### TUFT FEATURES ON INSTANCE-LEVEL
                    # if tuftN > 0:
                    #     ####### TUFT SHAPE FEATURES
                    #     temp, _ = label(segmResults == 3)
                    #     regionPropsTemp = regionprops(temp)
                    #     tissueFeatures['tuft_shape_solidity'] = np.array([e.solidity for e in regionPropsTemp])  # convexity measure
                    #     tissueFeatures['tuft_shape_circularity'] = np.array([(4.*np.pi*e.area)/np.square(e.perimeter) for e in regionPropsTemp])  # circularity measure
                    #     tissueFeatures['tuft_shape_elongation'] = np.array([1.- e.minor_axis_length/e.major_axis_length for e in regionPropsTemp])  # from 0-1, 0=circle, the higher the number, the elongated subject is
                    #     tissueFeatures['tuft_shape_eccentricity'] = np.array([e.eccentricity for e in regionPropsTemp])  # eccentricity, inertia ratio: 0 = circle, 1 = ellipse
    
                    ####### TUBULE FEATURES
                    if tissueFeatures['tubule_N'][0] == 0:
                        tissueFeatures['tubule_sizes'], tissueFeatures['tubule_diameters'], tissueFeatures['tubule_distance_to_closest_instance'] = [], [], []
                    else:
                        tubule_features = np.array(Parallel(n_jobs=num_cores)(delayed(computeTubuleFeatures)(i) for i in range(tubuliInstanceID_StartsWith, segmResults.max()+1)))
                        tissueFeatures['tubule_sizes'] = tubule_features[:, 0]
                        tissueFeatures['tubule_diameters'] = tubule_features[:, 1]
                        tissueFeatures['tubule_distance_to_closest_instance'] = tubule_features[:, 2]
    
                    # ARTERY FEATURES
                    if arteryN == 0:
                        tissueFeatures['artery_sizes'], tissueFeatures['artery_sizes_lumen'], tissueFeatures['artery_sizes_wall'], tissueFeatures['artery_diameters'], tissueFeatures['artery_diameters_lumen'], tissueFeatures['artery_diameters_wall'], tissueFeatures['artery_structure_distance'] = [], [], [], [], [], [], []
                    else:
                        temp, _ = label(np.logical_or(segmResults == 5, segmResults == 6))
                        temp2 = segmResults == 6
                        artery_features = np.array(Parallel(n_jobs=num_cores)(delayed(computeArteryFeatures)(i) for i in range(1, arteryN+1)))
                        tissueFeatures['artery_sizes'] = artery_features[:, 0]
                        tissueFeatures['artery_sizes_lumen'] = artery_features[:, 1]
                        tissueFeatures['artery_sizes_wall'] = artery_features[:, 2]
                        tissueFeatures['artery_diameters'] = artery_features[:, 3]
                        tissueFeatures['artery_diameters_lumen'] = artery_features[:, 4]
                        tissueFeatures['artery_diameters_wall'] = artery_features[:, 5]
                        tissueFeatures['artery_structure_distance'] = artery_features[:, 6]
    
    
                    # for key in tissueFeatures.keys():
                    for key in list(tissueFeatures):
                        value = tissueFeatures[key]
                        if not (type(value).__module__ == 'numpy' and value.dtype == 'bool'):
                            tissueFeatures[key] = np.round(value, 2)
                            # if len(value) > 1:
                            #     tissueFeatures[key + '_MEDIAN'] = [np.round(np.median(value), 2)]
                            #     tissueFeatures[key + '_MEAN'] = [np.round(np.mean(value), 2)]
                            #     tissueFeatures[key + '_STD'] = [np.round(np.std(value), 2)]
                        else:
                            if len(value) > 1:
                                tissueFeatures[key + '_%'] = [np.round(value.mean() * 100, 1)]
    
    
                    resultsDict = dict(sorted(tissueFeatures.items(), key=lambda item: item[0]))
    
                    resultsPD = pd.DataFrame.from_dict(resultsDict, orient='index').transpose()
                    resultsPD.to_csv(resultsFolder + '/' + tissueName + '_features.csv', index=False, header=True, encoding='utf-8')
    
    
    except:
        logger.exception('! Exception !')
        raise
    
    log.info('Amount of tissue samples without instances: {}'.format(len(tissuesWOinstances)))
    if len(tissuesWOinstances) > 0:
        log.info('Their Filenames: {}'.format(tissuesWOinstances))
    log.info('%%%% Ended regularly ! %%%%')
    
    
    ######################## Browse through all features and print structure counts
    # import os
    # import pandas as pd
    #
    # path = ""
    #
    # wsiCounter = 0
    # tissueCounter = 0
    #
    # tubuleN = 0
    # glomN = 0
    # tuftN = 0
    # arteryN = 0
    # lumenN = 0
    #
    # for dirName, subdirList, fileList in os.walk(path):
    #     filesCSV = [fname for fname in fileList if fname.endswith('.csv')]
    #     if len(filesCSV) > 0:
    #         tissueCounter += len(filesCSV)
    #         wsiCounter += len(set(['_'.join(fname.split('_')[:-2]) for fname in filesCSV]))
    #
    #         for fname in filesCSV:
    #             resultsDict = pd.read_csv(os.path.join(dirName, fname)).to_dict()
    #             tubuleN += resultsDict['tubule_N'][0]
    #             glomN += resultsDict['glom_N'][0]
    #             tuftN += resultsDict['tuft_N_segments'][0]
    #             arteryN += resultsDict['artery_N'][0]
    #             lumenN += resultsDict['lumen_N_segments'][0]
    #
    # print('Analyzed WSI: {}, analyzed tissues: {}'.format(wsiCounter, tissueCounter))
    # print('Tubuli: {}'.format(tubuleN))
    # print('Glom: {}'.format(glomN))
    # print('Tuft: {}'.format(tuftN))
    # print('Artery: {}'.format(arteryN))
    # print('Lumen: {}'.format(lumenN))