In [21]:
import json
import re
import os

# load phrase mining result (first half of TopMine)
with open("data/corpus.txt") as f:
    lines = f.readlines()

In [22]:
# collapse repeated spaces inside a phrase
def process_double_space(phrase):
    tokens = re.split(' ', phrase)
    # drop every empty token (not just the first one, as remove('') would),
    # so runs of multiple spaces are fully collapsed
    tokens = [t for t in tokens if t != '']
    return ' '.join(tokens)
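
# a quick illustrative check of the behaviour:
assert process_double_space('finite  element') == 'finite element'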

In [23]:
# build a dictionary of phrase frequencies
phrase_freq={}
phrase_list=[]
phrase_index={}


count=0
for line in lines:
    # split each line into phrases on newlines/commas; treat hyphens as spaces
    phrases=[p for p in re.split('\n|,', line.replace('-',' ')) if p != '']
    sub_list=[]
    for phrase in phrases:
        new_phrase=process_double_space(phrase)
        if new_phrase not in phrase_index:
            phrase_index[new_phrase]=count
            count+=1
        if new_phrase in phrase_freq:
            phrase_freq[new_phrase]+=1
        else:
            phrase_freq[new_phrase]=1
        sub_list.append(new_phrase)
    phrase_list.append(sub_list)
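
# e.g., to peek at the most frequent phrases after cleaning
# (top_phrases is a scratch name added here for illustration):
top_phrases = sorted(phrase_freq.items(), key=lambda kv: kv[1], reverse=True)[:10]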

In [24]:
# check sizes of the two dictionaries (they should match)
print(len(phrase_freq))
print(len(phrase_index))


4463
4463

In [25]:
# function for breaking a long phrase into all contiguous sub-phrases
# (n-grams), shortest first
def regroup_phrase(phrase):
    words = re.split(' ', phrase)
    n = len(words)
    group = []
    for i in range(n):          # i = sub-phrase length - 1
        for j in range(n - i):  # j = starting word position
            group.append(' '.join(words[j:j + i + 1]))
    return group
            
regroup_phrase('a b c')


Out[25]:
['a', 'b', 'c', 'a b', 'b c', 'a b c']

In [26]:
# break long phrases into sub-phrases, keeping each sub-phrase
# only if it appears in the original dictionary
import math
import numpy

super_phrase_list=[]
super_phrase_freq={}
phrase_docfreq={}
phrase_idf=numpy.zeros(len(phrase_index))

for line in phrase_list:
    temp=set()
    sub_list=[]
    for phrase in line:
        group=regroup_phrase(phrase)
        for i in group:
            if i in phrase_index:
                temp.add(i)
                sub_list.append(i)
                if i in super_phrase_freq:
                    super_phrase_freq[i]+=1
                else:
                    super_phrase_freq[i]=1
    for i in temp:
        if i in phrase_docfreq:
            phrase_docfreq[i]+=1
        else:
            phrase_docfreq[i]=1
    super_phrase_list.append(sub_list)
    
# calculate idf: idf(p) = log2(N / df(p)), where N is the number of documents
for key in phrase_docfreq:
    count=phrase_docfreq[key]
    phrase_idf[phrase_index[key]]=math.log(len(super_phrase_list)/float(count),2)
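
# e.g., to see which phrases this weighting treats as most specific
# (rarest is a scratch name used here for illustration):
rarest = sorted(phrase_docfreq, key=phrase_docfreq.get)[:10]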

# check dictionary sizes and the number of documents
print(len(super_phrase_freq))
print(len(phrase_docfreq))
print(len(super_phrase_list))


4463
4463
1554

In [27]:
# generate tf-idf vectors for all papers
texts_vec=[]

for line in super_phrase_list:
    vec=numpy.zeros(len(phrase_index))   
    #accumulate local count
    for p in line:
        vec[phrase_index[p]]+=1        
    #calculate tf
    vec/=float(len(line))
    #get tf-idf by multiplying idf
    vec*=phrase_idf
    texts_vec.append(vec)
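
# optional: stack the vectors into one matrix for vectorised operations
# (texts_mat is an illustrative name, not used elsewhere)
texts_mat = numpy.vstack(texts_vec)  # shape: (num papers, num phrases)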

In [28]:
# similarity function: cosine similarity
from numpy import linalg as la
def get_similarity(a, b):
    # note: returns nan if either vector is all-zero
    return numpy.dot(a, b) / (la.norm(a) * la.norm(b))

print(get_similarity(texts_vec[0], texts_vec[8]))
print(get_similarity(texts_vec[0], texts_vec[0]))


0.00300707728668
1.0

In [29]:
# add similarity information to the json records
with open("data/Paper_2014_clean.json", "r") as f:
    p_data = json.load(f)
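
# each record is expected to carry 'index', 'abstract', 'broad_topic',
# 'citations', and 'cited_by' fields, which the cells below rely on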

In [30]:
# make sure the JSON indices match the indices of our result
print(p_data[1553]['abstract'])
print(super_phrase_list[1553])


This study used finite element models to assess potential benefits of selected unconventional features, implemented in an experimental car, for vehicle crashworthiness in frontal impact. These safety features include: structural energy-absorbing bumper, hood lockdown with optimized hood and extendable bumper. The A-pillar intrusion and the effective acceleration of the vehicle were used as the parameters for measuring frontal impact crashworthiness performance.
['study', 'finite', 'element', 'model', 'finite element', 'element model', 'finite element model', 'assessment', 'potential', 'benefits', 'selection', 'unconventional', 'features', 'implementation', 'experimental', 'car', 'vehicle', 'crashworthiness', 'frontal', 'impact', 'safety', 'features', 'including', 'structure', 'hood', 'optimization', 'hood', 'extended', 'intrusion', 'effectiveness', 'acceleration', 'vehicle', 'parameters', 'measure', 'frontal', 'impact', 'crashworthiness', 'performance', 'math', 'based', 'math based', 'Performance Evaluation', 'experimental', 'car', 'frontal', 'impact', 'crashworthiness', 'math', 'based', 'math based', 'Performance Evaluation', 'experimental', 'car', 'frontal', 'impact', 'crashworthiness', 'math', 'based', 'math based', 'Performance Evaluation', 'experimental', 'car', 'frontal', 'impact', 'crashworthiness']

In [31]:
# get the similarity scores for citations and add them to the json
for p in p_data:
    
    idx=int(p['index'])
    cite_sim=[]
    cite_by_sim=[]
    all_cite_sim=[]
    
    for i in p['citations']:
        s=get_similarity(texts_vec[idx],texts_vec[int(i)])
        cite_sim.append(s)
        all_cite_sim.append(s)
        
    for i in p['cited_by']:
        s=get_similarity(texts_vec[idx],texts_vec[int(i)])
        cite_by_sim.append(s)
        all_cite_sim.append(s)
    
    
    # json can't serialize numpy arrays, so store the scores as plain lists
    p['citations_sim']=cite_sim
    p['cited_by_sim']=cite_by_sim
    p['all_cite_sim']=all_cite_sim

In [32]:
# sanity check
p=p_data[1]
print(p_data[803]['abstract'])
print(p_data[803]['broad_topic'])
print(' ')
print(p_data[108]['abstract'])
print(p_data[108]['broad_topic'])
print(' ')
print(p['abstract'])
print(p['broad_topic'])
print(' ')
print(p['citations'])
print(p['citations_sim'])


Most structural products have complex geometry to meet customer’s demand of high functionality. Since manufacturing those products in one piece is either impossible or uneconomical, most structural products are assemblies of components with simpler geometries. The conventional way to design structural assemblies is to design overall geometry first, and then decompose the geometry to determine the part boundary and joint locations. This two-step process, however, can lead to sub-optimal designs since the product geometry, even if optimized as one piece, would not be optimal after decomposition. This paper presents a method for synthesizing structural assemblies directly from the design specifications, without going through the two-step process. Given an extended design domain with boundary and loading conditions, the method simultaneously optimizes the topology and geometry of an entire structure and the location and configuration of joints, considering structural performance, manufacturability, and assembleability. As a relaxation of our previous work utilizing a beam-based ground structure [1], this paper presents a new formulation in a continuum design domain, which greatly enhances the ability to represent complex structural geometry observed in real-world products. A multi-objective genetic algorithm is used to obtain Pareto optimal solutions that exhibits trade-offs among stiffness, weight, manufacturability, and assembleability.

 
Level-set approaches are a family of domain classification techniques that rely on defining a scalar level-set function (LSF), then carrying out the classification based on the value of the function relative to one or more thresholds. Most continuum topology optimization formulations are at heart, a classification problem of the design domain into structural materials and void. As such, level-set approaches are gaining acceptance and popularity in structural topology optimization. In conventional level set approaches, finding an optimum LSF involves solution of a Hamilton-Jacobi system of partial differential equations with a large number of degrees of freedom, which in turn, cannot be accomplished without gradients information of the objective being optimized. A new approach is proposed in this paper where design variables are defined as the explicit values of the LSF at knot points, then a Kriging model is used to interpolate the LSF values within the rest of the domain so that classification into material or void can be performed. Perceived advantages of the explicit level-set (ELS) approach include alleviating the need for gradients of objectives and constraints, while maintaining a reasonable number of design variables that is independent from the mesh size. A hybrid genetic algorithm (GA) is then used for solving the optimization problem(s). An example problem of a short cantilever is studied under various settings of the ELS parameters in order to infer the best practice recommendations for tuning the approach. Capabilities of the approach are then further demonstrated by exploring its performance on several test problems.
Application-Tailored Optimization Methods
 
Level-set methods are domain classification techniques that are gaining popularity in the recent years for structural topology optimization. Level sets classify a domain into two or more categories (such as material and void) by examining the value of a scalar level-set function (LSF) defined in the entire design domain. In most level-set formulations, a large number of design variables, or degrees of freedom is used to define the LSF, which implicitly defines the structure. The large number of design variables makes non-gradient optimization techniques all but ineffective. Kriging-interpolated level sets (KLS) on the other hand are formulated with an objective to enable non-gradient optimization by defining the design variables as the LSF values at few select locations (knot points) and using a Kriging model to interpolate the LSF in the rest of the design domain. A downside of concern when adopting KLS, is that using too few knot points may limit the capability to represent complex shapes, while using too many knot points may cause difficulty for non-gradient optimization. This paper presents a study of the effect of number and layout of the knot points in KLS on the capability to represent complex topologies in single and multi-component structures. Image matching error metrics are employed to assess the degree of mismatch between target topologies and those best-attainable via KLS. Results are presented in a catalogue-style in order to facilitate appropriate selection of knot-points by designers wishing to apply KLS for topology optimization.
Application-Tailored Optimization Methods
 
[u'803', u'108']
[0.074864227709044301, 0.54136390292430325]

In [33]:
# get phrase features for classification step
for p in p_data:
    
    i=int(p['index'])
    
    # copy so the tf-idf vectors in texts_vec are not mutated in place
    vec=texts_vec[i].copy()
    # rescale so the weights sum to the paper's phrase-token count
    z=sum(vec)
    vec/=z
    vec*=len(super_phrase_list[i])
    word_bag={}
    
    for j in super_phrase_list[i]:
        word_bag[str(phrase_index[j])]=vec[phrase_index[j]]
    
    p['phrases']=word_bag
    p['phrases_size']=len(super_phrase_list[i])
    
index_phrase={}
for key in phrase_index:
    index_phrase[phrase_index[key]]=key
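
# equivalently: index_phrase = {v: k for k, v in phrase_index.items()}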

In [34]:
# sanity check
from pprint import pprint

print(index_phrase[1])

pprint(p_data[0]['phrases'])
print(p_data[0]['phrases_size'])
print(p_data[0]['abstract'])


level set
{'0': 0.9339053662077913,
 '1': 10.759764765407283,
 '10': 1.3113127542104279,
 '1063': 1.0858289956121243,
 '11': 1.3924718729894507,
 '12': 0.39921368386123585,
 '13': 1.245826129223383,
 '14': 0.66861008191347704,
 '148': 0.13901432944488251,
 '15': 4.80960966668406,
 '16': 1.4992461769088636,
 '17': 1.0504948227808346,
 '18': 1.1021721405981735,
 '19': 1.3358475686034921,
 '2': 0.93510152842312977,
 '20': 4.4773586025693666,
 '21': 4.9670361122383335,
 '22': 1.591756459469672,
 '23': 1.9548439881130657,
 '24': 1.1982488153974169,
 '25': 1.1135204182619747,
 '26': 1.8916376523928693,
 '263': 4.0084507323658407,
 '27': 0.96977185627747253,
 '28': 1.1019208548374571,
 '29': 0.71900984511049382,
 '3': 6.1484370088041613,
 '30': 0.43187125512703067,
 '31': 0.91087322529178505,
 '32': 0.5465678915762201,
 '33': 0.84198359100644438,
 '34': 0.54008356591118112,
 '35': 1.3217879439255495,
 '36': 0.82180483686758976,
 '362': 0.70222733758398315,
 '37': 1.7268685283242584,
 '38': 1.0318371780482614,
 '39': 8.5103864028251763,
 '4': 0.80338857099165084,
 '40': 1.8411079527252006,
 '41': 1.8885483845798636,
 '42': 1.1374191276677916,
 '43': 0.35234483398418648,
 '44': 5.5676020913098734,
 '45': 0.33726633707155701,
 '46': 7.5047480564780251,
 '5': 14.89317620494406,
 '502': 4.3923272109409295,
 '6': 0.72265070125573161,
 '7': 1.3437579410638525,
 '731': 1.1565129246760255,
 '76': 1.6658067369723015,
 '8': 2.5015826854926755,
 '9': 1.0702346597368226,
 '98': 3.9569181634880941,
 '995': 2.6679034314271739}
130
In this paper, we propose a level-set based topology optimization method for designing a reactor, which is used as a part of the DC-DC converter in electric and hybrid vehicles. Since it realizes a high-power driving motor and its performance relies on its component, i.e., reactor core, it is valuable to establish a reasonable design method for the reactor core. Boundary tracking type level-set topology optimization is suitable for this purpose, because the shape and topology of the target structure is clearly represented by the zero boundary of the level-set function, and the state variables are accurately computed using the zero boundary tracking mesh. We formulate the design problem on the basis of electromagnetics, and derive the design sensitivities. The derived sensitivities are linked with boundary tracking type level-set topology optimization, and as a result, a useful structural optimization method for the reactor core design problem is developed.

In [35]:
super_data={}
super_data['papers']=p_data
super_data['index_phrase']=index_phrase


path="data/super_data.json"
if os.path.isfile(path):
    os.remove(path)
with open(path, "w") as f:
    json.dump(super_data, f)
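
# illustrative read-back check (reloaded is a scratch name):
with open(path) as f:
    reloaded = json.load(f)
assert len(reloaded['papers']) == len(p_data)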
