diff --git a/.gitignore b/.gitignore index 264daca..2e04467 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -*__pycache__ \ No newline at end of file +*__pycache__ +*.ipynb \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..f2d6bb9 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "GLIP"] + path = GLIP + url = https://github.com/Junital/GLIP.git diff --git a/GLIP b/GLIP new file mode 160000 index 0000000..c43038f --- /dev/null +++ b/GLIP @@ -0,0 +1 @@ +Subproject commit c43038f7a96e5515c125a2faa93bdf7ff6934a9f diff --git a/assets/COCO_train2014_000000022882.jpg b/assets/COCO_train2014_000000022882.jpg new file mode 100644 index 0000000..be36b62 Binary files /dev/null and b/assets/COCO_train2014_000000022882.jpg differ diff --git a/assets/COCO_train2014_000000269022.jpg b/assets/COCO_train2014_000000269022.jpg new file mode 100644 index 0000000..4bec226 Binary files /dev/null and b/assets/COCO_train2014_000000269022.jpg differ diff --git a/assets/COCO_train2014_000000576849.jpg b/assets/COCO_train2014_000000576849.jpg new file mode 100644 index 0000000..54c3037 Binary files /dev/null and b/assets/COCO_train2014_000000576849.jpg differ diff --git a/data/ExperImages.zip b/data/ExperImages.zip new file mode 100644 index 0000000..33ccd00 Binary files /dev/null and b/data/ExperImages.zip differ diff --git a/data/ExperImages/n103484.jpg b/data/ExperImages/n103484.jpg new file mode 100644 index 0000000..38d92b0 Binary files /dev/null and b/data/ExperImages/n103484.jpg differ diff --git a/data/ExperImages/n109961.jpg b/data/ExperImages/n109961.jpg new file mode 100644 index 0000000..f3773cf Binary files /dev/null and b/data/ExperImages/n109961.jpg differ diff --git a/data/ExperImages/n111074.jpg b/data/ExperImages/n111074.jpg new file mode 100644 index 0000000..ec71a56 Binary files /dev/null and b/data/ExperImages/n111074.jpg differ diff --git a/data/ExperImages/n115871.jpg b/data/ExperImages/n115871.jpg new file mode 100644 index 0000000..a64afba Binary files /dev/null and b/data/ExperImages/n115871.jpg differ diff --git a/data/ExperImages/n130253.jpg b/data/ExperImages/n130253.jpg new file mode 100644 index 0000000..38fda3f Binary files /dev/null and b/data/ExperImages/n130253.jpg differ diff --git a/data/ExperImages/n130371.jpg b/data/ExperImages/n130371.jpg new file mode 100644 index 0000000..3ad0334 Binary files /dev/null and b/data/ExperImages/n130371.jpg differ diff --git a/data/ExperImages/n133975.jpg b/data/ExperImages/n133975.jpg new file mode 100644 index 0000000..38a5d7c Binary files /dev/null and b/data/ExperImages/n133975.jpg differ diff --git a/data/ExperImages/n139491.jpg b/data/ExperImages/n139491.jpg new file mode 100644 index 0000000..10b8ecb Binary files /dev/null and b/data/ExperImages/n139491.jpg differ diff --git a/data/ExperImages/n147001.jpg b/data/ExperImages/n147001.jpg new file mode 100644 index 0000000..0230873 Binary files /dev/null and b/data/ExperImages/n147001.jpg differ diff --git a/data/ExperImages/n149859.jpg b/data/ExperImages/n149859.jpg new file mode 100644 index 0000000..9b66e60 Binary files /dev/null and b/data/ExperImages/n149859.jpg differ diff --git a/data/ExperImages/n154501.jpg b/data/ExperImages/n154501.jpg new file mode 100644 index 0000000..d3bf565 Binary files /dev/null and b/data/ExperImages/n154501.jpg differ diff --git a/data/ExperImages/n155297.jpg b/data/ExperImages/n155297.jpg new file mode 100644 index 0000000..2a3e3bd Binary files /dev/null and 
b/data/ExperImages/n155297.jpg differ diff --git a/data/ExperImages/n15719.jpg b/data/ExperImages/n15719.jpg new file mode 100644 index 0000000..739dc36 Binary files /dev/null and b/data/ExperImages/n15719.jpg differ diff --git a/data/ExperImages/n158542.jpg b/data/ExperImages/n158542.jpg new file mode 100644 index 0000000..4ffcefc Binary files /dev/null and b/data/ExperImages/n158542.jpg differ diff --git a/data/ExperImages/n170047.jpg b/data/ExperImages/n170047.jpg new file mode 100644 index 0000000..ff913a5 Binary files /dev/null and b/data/ExperImages/n170047.jpg differ diff --git a/data/ExperImages/n171693.jpg b/data/ExperImages/n171693.jpg new file mode 100644 index 0000000..293c803 Binary files /dev/null and b/data/ExperImages/n171693.jpg differ diff --git a/data/ExperImages/n173361.jpg b/data/ExperImages/n173361.jpg new file mode 100644 index 0000000..336f662 Binary files /dev/null and b/data/ExperImages/n173361.jpg differ diff --git a/data/ExperImages/n175869.jpg b/data/ExperImages/n175869.jpg new file mode 100644 index 0000000..8419275 Binary files /dev/null and b/data/ExperImages/n175869.jpg differ diff --git a/data/ExperImages/n178815.jpg b/data/ExperImages/n178815.jpg new file mode 100644 index 0000000..e12f8a8 Binary files /dev/null and b/data/ExperImages/n178815.jpg differ diff --git a/data/ExperImages/n181615.jpg b/data/ExperImages/n181615.jpg new file mode 100644 index 0000000..1362abc Binary files /dev/null and b/data/ExperImages/n181615.jpg differ diff --git a/data/ExperImages/n182120.jpg b/data/ExperImages/n182120.jpg new file mode 100644 index 0000000..11d2cdd Binary files /dev/null and b/data/ExperImages/n182120.jpg differ diff --git a/data/ExperImages/n184739.jpg b/data/ExperImages/n184739.jpg new file mode 100644 index 0000000..59b586c Binary files /dev/null and b/data/ExperImages/n184739.jpg differ diff --git a/data/ExperImages/n188669.jpg b/data/ExperImages/n188669.jpg new file mode 100644 index 0000000..3b94d55 Binary files /dev/null and b/data/ExperImages/n188669.jpg differ diff --git a/data/ExperImages/n189986.jpg b/data/ExperImages/n189986.jpg new file mode 100644 index 0000000..224be63 Binary files /dev/null and b/data/ExperImages/n189986.jpg differ diff --git a/data/ExperImages/n194711.jpg b/data/ExperImages/n194711.jpg new file mode 100644 index 0000000..d388188 Binary files /dev/null and b/data/ExperImages/n194711.jpg differ diff --git a/data/ExperImages/n196089.jpg b/data/ExperImages/n196089.jpg new file mode 100644 index 0000000..9622186 Binary files /dev/null and b/data/ExperImages/n196089.jpg differ diff --git a/data/ExperImages/n20290.jpg b/data/ExperImages/n20290.jpg new file mode 100644 index 0000000..0e73bec Binary files /dev/null and b/data/ExperImages/n20290.jpg differ diff --git a/data/ExperImages/n210059.jpg b/data/ExperImages/n210059.jpg new file mode 100644 index 0000000..b503d11 Binary files /dev/null and b/data/ExperImages/n210059.jpg differ diff --git a/data/ExperImages/n210277.jpg b/data/ExperImages/n210277.jpg new file mode 100644 index 0000000..a5d91cc Binary files /dev/null and b/data/ExperImages/n210277.jpg differ diff --git a/data/ExperImages/n214414.jpg b/data/ExperImages/n214414.jpg new file mode 100644 index 0000000..027767f Binary files /dev/null and b/data/ExperImages/n214414.jpg differ diff --git a/data/ExperImages/n215517.jpg b/data/ExperImages/n215517.jpg new file mode 100644 index 0000000..e0e67bc Binary files /dev/null and b/data/ExperImages/n215517.jpg differ diff --git a/data/ExperImages/n229656.jpg 
b/data/ExperImages/n229656.jpg new file mode 100644 index 0000000..01fa258 Binary files /dev/null and b/data/ExperImages/n229656.jpg differ diff --git a/data/ExperImages/n239383.jpg b/data/ExperImages/n239383.jpg new file mode 100644 index 0000000..3c147a1 Binary files /dev/null and b/data/ExperImages/n239383.jpg differ diff --git a/data/ExperImages/n24913.jpg b/data/ExperImages/n24913.jpg new file mode 100644 index 0000000..9b6c98e Binary files /dev/null and b/data/ExperImages/n24913.jpg differ diff --git a/data/ExperImages/n253231.jpg b/data/ExperImages/n253231.jpg new file mode 100644 index 0000000..3c31c2c Binary files /dev/null and b/data/ExperImages/n253231.jpg differ diff --git a/data/ExperImages/n258003.jpg b/data/ExperImages/n258003.jpg new file mode 100644 index 0000000..82b34f0 Binary files /dev/null and b/data/ExperImages/n258003.jpg differ diff --git a/data/ExperImages/n261100.jpg b/data/ExperImages/n261100.jpg new file mode 100644 index 0000000..c43ab19 Binary files /dev/null and b/data/ExperImages/n261100.jpg differ diff --git a/data/ExperImages/n264509.jpg b/data/ExperImages/n264509.jpg new file mode 100644 index 0000000..e734049 Binary files /dev/null and b/data/ExperImages/n264509.jpg differ diff --git a/data/ExperImages/n266971.jpg b/data/ExperImages/n266971.jpg new file mode 100644 index 0000000..e170f05 Binary files /dev/null and b/data/ExperImages/n266971.jpg differ diff --git a/data/ExperImages/n267826.jpg b/data/ExperImages/n267826.jpg new file mode 100644 index 0000000..64e7161 Binary files /dev/null and b/data/ExperImages/n267826.jpg differ diff --git a/data/ExperImages/n274318.jpg b/data/ExperImages/n274318.jpg new file mode 100644 index 0000000..3dd7a7f Binary files /dev/null and b/data/ExperImages/n274318.jpg differ diff --git a/data/ExperImages/n275523.jpg b/data/ExperImages/n275523.jpg new file mode 100644 index 0000000..0907916 Binary files /dev/null and b/data/ExperImages/n275523.jpg differ diff --git a/data/ExperImages/n287162.jpg b/data/ExperImages/n287162.jpg new file mode 100644 index 0000000..4a2c104 Binary files /dev/null and b/data/ExperImages/n287162.jpg differ diff --git a/data/ExperImages/n298165.jpg b/data/ExperImages/n298165.jpg new file mode 100644 index 0000000..68e53d4 Binary files /dev/null and b/data/ExperImages/n298165.jpg differ diff --git a/data/ExperImages/n299577.jpg b/data/ExperImages/n299577.jpg new file mode 100644 index 0000000..6844b44 Binary files /dev/null and b/data/ExperImages/n299577.jpg differ diff --git a/data/ExperImages/n307753.jpg b/data/ExperImages/n307753.jpg new file mode 100644 index 0000000..6ffd362 Binary files /dev/null and b/data/ExperImages/n307753.jpg differ diff --git a/data/ExperImages/n311711.jpg b/data/ExperImages/n311711.jpg new file mode 100644 index 0000000..3d01a7c Binary files /dev/null and b/data/ExperImages/n311711.jpg differ diff --git a/data/ExperImages/n312992.jpg b/data/ExperImages/n312992.jpg new file mode 100644 index 0000000..45fa9dc Binary files /dev/null and b/data/ExperImages/n312992.jpg differ diff --git a/data/ExperImages/n318563.jpg b/data/ExperImages/n318563.jpg new file mode 100644 index 0000000..757b186 Binary files /dev/null and b/data/ExperImages/n318563.jpg differ diff --git a/data/ExperImages/n324512.jpg b/data/ExperImages/n324512.jpg new file mode 100644 index 0000000..8fc1d1e Binary files /dev/null and b/data/ExperImages/n324512.jpg differ diff --git a/data/ExperImages/n339300.jpg b/data/ExperImages/n339300.jpg new file mode 100644 index 0000000..2ad9807 Binary files /dev/null 
and b/data/ExperImages/n339300.jpg differ diff --git a/data/ExperImages/n339728.jpg b/data/ExperImages/n339728.jpg new file mode 100644 index 0000000..8d04212 Binary files /dev/null and b/data/ExperImages/n339728.jpg differ diff --git a/data/ExperImages/n341278.jpg b/data/ExperImages/n341278.jpg new file mode 100644 index 0000000..16b119b Binary files /dev/null and b/data/ExperImages/n341278.jpg differ diff --git a/data/ExperImages/n343034.jpg b/data/ExperImages/n343034.jpg new file mode 100644 index 0000000..264d7cd Binary files /dev/null and b/data/ExperImages/n343034.jpg differ diff --git a/data/ExperImages/n345315.jpg b/data/ExperImages/n345315.jpg new file mode 100644 index 0000000..cd90c7b Binary files /dev/null and b/data/ExperImages/n345315.jpg differ diff --git a/data/ExperImages/n345363.jpg b/data/ExperImages/n345363.jpg new file mode 100644 index 0000000..1dc750b Binary files /dev/null and b/data/ExperImages/n345363.jpg differ diff --git a/data/ExperImages/n349224.jpg b/data/ExperImages/n349224.jpg new file mode 100644 index 0000000..493ad91 Binary files /dev/null and b/data/ExperImages/n349224.jpg differ diff --git a/data/ExperImages/n366949.jpg b/data/ExperImages/n366949.jpg new file mode 100644 index 0000000..5574ba2 Binary files /dev/null and b/data/ExperImages/n366949.jpg differ diff --git a/data/ExperImages/n383044.jpg b/data/ExperImages/n383044.jpg new file mode 100644 index 0000000..78d999c Binary files /dev/null and b/data/ExperImages/n383044.jpg differ diff --git a/data/ExperImages/n394813.jpg b/data/ExperImages/n394813.jpg new file mode 100644 index 0000000..76fdb55 Binary files /dev/null and b/data/ExperImages/n394813.jpg differ diff --git a/data/ExperImages/n406179.jpg b/data/ExperImages/n406179.jpg new file mode 100644 index 0000000..fd8003a Binary files /dev/null and b/data/ExperImages/n406179.jpg differ diff --git a/data/ExperImages/n408516.jpg b/data/ExperImages/n408516.jpg new file mode 100644 index 0000000..5ad0d9f Binary files /dev/null and b/data/ExperImages/n408516.jpg differ diff --git a/data/ExperImages/n409008.jpg b/data/ExperImages/n409008.jpg new file mode 100644 index 0000000..ed994af Binary files /dev/null and b/data/ExperImages/n409008.jpg differ diff --git a/data/ExperImages/n41686.jpg b/data/ExperImages/n41686.jpg new file mode 100644 index 0000000..24e529e Binary files /dev/null and b/data/ExperImages/n41686.jpg differ diff --git a/data/ExperImages/n424704.jpg b/data/ExperImages/n424704.jpg new file mode 100644 index 0000000..6dd7044 Binary files /dev/null and b/data/ExperImages/n424704.jpg differ diff --git a/data/ExperImages/n437038.jpg b/data/ExperImages/n437038.jpg new file mode 100644 index 0000000..fc327b0 Binary files /dev/null and b/data/ExperImages/n437038.jpg differ diff --git a/data/ExperImages/n464936.jpg b/data/ExperImages/n464936.jpg new file mode 100644 index 0000000..677b950 Binary files /dev/null and b/data/ExperImages/n464936.jpg differ diff --git a/data/ExperImages/n466504.jpg b/data/ExperImages/n466504.jpg new file mode 100644 index 0000000..6a877bb Binary files /dev/null and b/data/ExperImages/n466504.jpg differ diff --git a/data/ExperImages/n468138.jpg b/data/ExperImages/n468138.jpg new file mode 100644 index 0000000..2396fcd Binary files /dev/null and b/data/ExperImages/n468138.jpg differ diff --git a/data/ExperImages/n474949.jpg b/data/ExperImages/n474949.jpg new file mode 100644 index 0000000..3c7c8e3 Binary files /dev/null and b/data/ExperImages/n474949.jpg differ diff --git a/data/ExperImages/n475122.jpg 
b/data/ExperImages/n475122.jpg new file mode 100644 index 0000000..d438334 Binary files /dev/null and b/data/ExperImages/n475122.jpg differ diff --git a/data/ExperImages/n479684.jpg b/data/ExperImages/n479684.jpg new file mode 100644 index 0000000..b8cc80e Binary files /dev/null and b/data/ExperImages/n479684.jpg differ diff --git a/data/ExperImages/n487547.jpg b/data/ExperImages/n487547.jpg new file mode 100644 index 0000000..ce59d37 Binary files /dev/null and b/data/ExperImages/n487547.jpg differ diff --git a/data/ExperImages/n488098.jpg b/data/ExperImages/n488098.jpg new file mode 100644 index 0000000..5ad32a3 Binary files /dev/null and b/data/ExperImages/n488098.jpg differ diff --git a/data/ExperImages/n49911.jpg b/data/ExperImages/n49911.jpg new file mode 100644 index 0000000..213dc3f Binary files /dev/null and b/data/ExperImages/n49911.jpg differ diff --git a/data/ExperImages/n511793.jpg b/data/ExperImages/n511793.jpg new file mode 100644 index 0000000..08afe81 Binary files /dev/null and b/data/ExperImages/n511793.jpg differ diff --git a/data/ExperImages/n51303.jpg b/data/ExperImages/n51303.jpg new file mode 100644 index 0000000..f94f76b Binary files /dev/null and b/data/ExperImages/n51303.jpg differ diff --git a/data/ExperImages/n513747.jpg b/data/ExperImages/n513747.jpg new file mode 100644 index 0000000..2f3001a Binary files /dev/null and b/data/ExperImages/n513747.jpg differ diff --git a/data/ExperImages/n514077.jpg b/data/ExperImages/n514077.jpg new file mode 100644 index 0000000..78d4fba Binary files /dev/null and b/data/ExperImages/n514077.jpg differ diff --git a/data/ExperImages/n515157.jpg b/data/ExperImages/n515157.jpg new file mode 100644 index 0000000..049813a Binary files /dev/null and b/data/ExperImages/n515157.jpg differ diff --git a/data/ExperImages/n524673.jpg b/data/ExperImages/n524673.jpg new file mode 100644 index 0000000..76187e3 Binary files /dev/null and b/data/ExperImages/n524673.jpg differ diff --git a/data/ExperImages/n525013.jpg b/data/ExperImages/n525013.jpg new file mode 100644 index 0000000..0a1555b Binary files /dev/null and b/data/ExperImages/n525013.jpg differ diff --git a/data/ExperImages/n536090.jpg b/data/ExperImages/n536090.jpg new file mode 100644 index 0000000..42fae4d Binary files /dev/null and b/data/ExperImages/n536090.jpg differ diff --git a/data/ExperImages/n537813.jpg b/data/ExperImages/n537813.jpg new file mode 100644 index 0000000..d11b18a Binary files /dev/null and b/data/ExperImages/n537813.jpg differ diff --git a/data/ExperImages/n544799.jpg b/data/ExperImages/n544799.jpg new file mode 100644 index 0000000..3a92f44 Binary files /dev/null and b/data/ExperImages/n544799.jpg differ diff --git a/data/ExperImages/n550668.jpg b/data/ExperImages/n550668.jpg new file mode 100644 index 0000000..5b9d4a4 Binary files /dev/null and b/data/ExperImages/n550668.jpg differ diff --git a/data/ExperImages/n553018.jpg b/data/ExperImages/n553018.jpg new file mode 100644 index 0000000..999fba3 Binary files /dev/null and b/data/ExperImages/n553018.jpg differ diff --git a/data/ExperImages/n554025.jpg b/data/ExperImages/n554025.jpg new file mode 100644 index 0000000..c91c4c9 Binary files /dev/null and b/data/ExperImages/n554025.jpg differ diff --git a/data/ExperImages/n557683.jpg b/data/ExperImages/n557683.jpg new file mode 100644 index 0000000..83581d2 Binary files /dev/null and b/data/ExperImages/n557683.jpg differ diff --git a/data/ExperImages/n560895.jpg b/data/ExperImages/n560895.jpg new file mode 100644 index 0000000..4576ba5 Binary files /dev/null and 
b/data/ExperImages/n560895.jpg differ diff --git a/data/ExperImages/n56556.jpg b/data/ExperImages/n56556.jpg new file mode 100644 index 0000000..b526630 Binary files /dev/null and b/data/ExperImages/n56556.jpg differ diff --git a/data/ExperImages/n565573.jpg b/data/ExperImages/n565573.jpg new file mode 100644 index 0000000..b3922c2 Binary files /dev/null and b/data/ExperImages/n565573.jpg differ diff --git a/data/ExperImages/n568258.jpg b/data/ExperImages/n568258.jpg new file mode 100644 index 0000000..b9c1330 Binary files /dev/null and b/data/ExperImages/n568258.jpg differ diff --git a/data/ExperImages/n59657.jpg b/data/ExperImages/n59657.jpg new file mode 100644 index 0000000..9e98dd6 Binary files /dev/null and b/data/ExperImages/n59657.jpg differ diff --git a/data/ExperImages/n59853.jpg b/data/ExperImages/n59853.jpg new file mode 100644 index 0000000..2cf18cb Binary files /dev/null and b/data/ExperImages/n59853.jpg differ diff --git a/data/ExperImages/n60178.jpg b/data/ExperImages/n60178.jpg new file mode 100644 index 0000000..5ee3f22 Binary files /dev/null and b/data/ExperImages/n60178.jpg differ diff --git a/data/ExperImages/n61019.jpg b/data/ExperImages/n61019.jpg new file mode 100644 index 0000000..5d026f8 Binary files /dev/null and b/data/ExperImages/n61019.jpg differ diff --git a/data/ExperImages/n66626.jpg b/data/ExperImages/n66626.jpg new file mode 100644 index 0000000..c046837 Binary files /dev/null and b/data/ExperImages/n66626.jpg differ diff --git a/data/ExperImages/n93063.jpg b/data/ExperImages/n93063.jpg new file mode 100644 index 0000000..f1d7c68 Binary files /dev/null and b/data/ExperImages/n93063.jpg differ diff --git a/data/problem.xlsx b/data/problem.xlsx new file mode 100644 index 0000000..f1e7cf4 Binary files /dev/null and b/data/problem.xlsx differ diff --git a/data/test.csv b/data/test.csv new file mode 100644 index 0000000..3662a71 --- /dev/null +++ b/data/test.csv @@ -0,0 +1,101 @@ +isBalanced,question,imageId +True,Which kind of furniture is below the decoration?,n274318 +True,Which kind of clothing is warm?,n115871 +True,Which kind of clothing is not black?,n184739 +True,Do the pants that are not dirty look large?,n66626 +True,The lamp that is not turned-off is sitting on top of what?,n210059 +True,What kind of furniture is made of metal?,n188669 +True,Is the shirt orange or blue?,n554025 +True,Do you see either any white towels or pillows?,n468138 +True,Is the closed drawer to the right of a chair?,n15719 +True,What is the lamp made of?,n133975 +True,What does the man wear?,n51303 +True,What is under the cooking utensil made of wood?,n349224 +True,Are the end table and the desk made of the same material?,n194711 +True,Is the counter above a drawer?,n229656 +True,Are the pants black and long?,n149859 +True,What is the cup to the left of the keyboard made of?,n475122 +True,Is the standing woman behind the tomatoes wearing a hat?,n299577 +True,Where is the man in front of the fence standing on?,n464936 +True,What appliance is not used?,n409008 +True,Do you see any women inside the library?,n93063 +True,How large is the device the computer monitor is beside of?,n479684 +True,The bun is on what?,n24913 +True,What is the boy doing?,n49911 +True,Is there a bed or a desk in this picture?,n170047 +True,Does the gate look metallic and tall?,n196089 +True,Does the mirror look clean and brown?,n266971 +True,What is in front of the street light?,n511793 +True,Is the helmet on the right?,n339728 +True,Are there white desks in the picture?,n345363 +True,Is the house both 
white and small?,n109961 +True,Is the bag behind a chair?,n214414 +True,Is the garbage can to the left of the people?,n147001 +True,Are the benches in front of the person hard and red?,n324512 +True,Is there a train above the street that is made of brick?,n253231 +True,The tomatoes are in what?,n557683 +True,Is the plate on a counter?,n318563 +True,Are there any women in the photo that are not riding?,n59657 +True,What is the piece of furniture that is hanging above the wall called?,n61019 +True,Is the plastic bucket to the right or to the left of the toilet that is white?,n171693 +True,What is the man wearing?,n275523 +True,Is there a blue window in this picture?,n189986 +True,Does the dress look short sleeved?,n515157 +True,How large is the pitcher that the ketchup is in front of?,n158542 +True,What is in front of the trees?,n139491 +True,Does the person that is not old wear a hat?,n560895 +True,Is the happy person on the left or on the right side of the image?,n383044 +True,Is the empty bottle on the right side or on the left?,n60178 +True,Which kind of clothing is light colored?,n111074 +True,Do you see tables in the photo?,n154501 +True,What animal is standing on the small boat?,n178815 +True,What is the plate near the candle holder sitting atop?,n264509 +True,Which place is it?,n424704 +True,What is the device in front of the person that is sitting on the ground called?,n20290 +True,Do you see end tables next to the sofa on the left part of the photo?,n307753 +True,What is under the windows?,n341278 +True,Does that sweatshirt have striped pattern and gray color?,n345315 +True,What is the color of the toilet paper?,n287162 +True,What's the man doing?,n513747 +True,Which device is on?,n59853 +True,Are there black sandals or boots?,n41686 +True,Is the plastic cup to the left of the other cup small and colorful?,n215517 +True,What kind of furniture are the drawers in?,n133975 +True,Are the men to the left of a bowl?,n298165 +True,What animal is brown?,n406179 +True,What is the color of the wine glasses?,n544799 +True,Are there both cars and fences in the photo?,n130253 +True,Are there either any plates or breads in the image?,n267826 +True,What is on the motorcycle?,n339300 +True,How large are the rocks?,n408516 +True,Which kind of food is bigger than the blueberry?,n466504 +True,What is the woman wearing?,n537813 +True,What does the man wear?,n366949 +True,"Which side of the picture are the bags on, the left or the right?",n568258 +True,Where is the snowboarder standing on?,n553018 +True,How is this vehicle called?,n487547 +True,Who is holding onto the umbrella?,n103484 +True,What is the toilet brush next to the toilet made of?,n514077 +True,Who in this image is talking?,n343034 +True,What type of fast food is the woman that is not old looking at?,n394813 +True,Are the daughter that is not old and the daughter to the right of the dad both happy?,n155297 +True,Are the people that are talking leaning on the wood fence?,n181615 +True,Is the clock both round and gold?,n239383 +True,Is the girl to the left of the backpack waiting or playing?,n437038 +True,Is the table gray?,n56556 +True,What is the color of the bread that is not little?,n312992 +True,What is in front of the trees?,n536090 +True,Is that street sign black and small?,n311711 +True,Is that fork made of stainless steel?,n474949 +True,What is the weather like in the picture?,n130371 +True,Is there a mug on top of the bench?,n261100 +True,Is the sand both black and wet?,n175869 +True,What is the aircraft that the woman is looking 
at?,n524673 +True,What pieces of furniture are to the left of the car?,n173361 +True,Who in this photograph is looking down?,n488098 +True,Is the sink short or tall?,n182120 +True,What is that fence in front of?,n258003 +True,Are there any chairs near the decorative painting?,n210277 +True,How clean is the headband that the girl is wearing?,n525013 +True,Is the small building behind or in front of the bushy tree?,n550668 +True,Does the umbrella look female and white?,n565573 diff --git a/data/visualization.xlsx b/data/visualization.xlsx new file mode 100644 index 0000000..97de500 Binary files /dev/null and b/data/visualization.xlsx differ diff --git a/download_GLIP.sh b/download_GLIP.sh new file mode 100644 index 0000000..6e78006 --- /dev/null +++ b/download_GLIP.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# change this to your preferred download location +PRETRAINED_MODELS_PATH=./pretrained_models + +# GLIP model +mkdir -p $PRETRAINED_MODELS_PATH/GLIP/checkpoints +mkdir -p $PRETRAINED_MODELS_PATH/GLIP/configs +wget -nc -P $PRETRAINED_MODELS_PATH/GLIP/checkpoints https://huggingface.co/GLIPModel/GLIP/resolve/main/glip_large_model.pth +wget -nc -P $PRETRAINED_MODELS_PATH/GLIP/configs https://raw.githubusercontent.com/microsoft/GLIP/main/configs/pretrain/glip_Swin_L.yaml diff --git a/engine/object_graph.py b/engine/object_graph.py new file mode 100644 index 0000000..ed7c593 --- /dev/null +++ b/engine/object_graph.py @@ -0,0 +1,39 @@ +class ObjectGraph: + + def __init__(self, Name: str, Box, + Category : str = "object", + Location : tuple = (0, 0), + Size : tuple = (0, 0)) -> None: + + self.Attribute = dict( + Name = Name, + Box = Box, + Category = Category, + Location = Location, + Size = Size + ) + + def add(self, key: str, value: any) -> None: + + self.Attribute[key] = value + +class ObjectGraphGroup: + + def __init__(self, img=None, groupA=None, groupB=None): + if img is not None: + self.Graphs = [] + self.Relations = dict() + self.Img = img + elif groupA is not None and groupB is not None: + self.Graphs = groupA.Graphs + groupB.Graphs + self.Relations = groupA.Relations | groupB.Relations + self.Img = groupA.Img + else: + raise ValueError("You must give a image or give two groups.") + + def add_graph(self, graph: ObjectGraph): + + self.Graphs.append(graph) + + def add_relation(self, objA, objB, relation): + self.Relations[(objA, objB)] = relation diff --git a/engine/step_interpreters.py b/engine/step_interpreters.py index c588497..3f0584a 100644 --- a/engine/step_interpreters.py +++ b/engine/step_interpreters.py @@ -1,7 +1,10 @@ import cv2 import os import torch -import openai +import sys +import timeit +from typing import Union +from openai import OpenAI import functools import numpy as np import face_detection @@ -11,9 +14,14 @@ from PIL import Image,ImageDraw,ImageFont,ImageFilter from transformers import (ViltProcessor, ViltForQuestionAnswering, OwlViTProcessor, OwlViTForObjectDetection, + AutoModelForZeroShotObjectDetection, DetrImageProcessor, DetrForObjectDetection, MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation, CLIPProcessor, CLIPModel, AutoProcessor, BlipForQuestionAnswering) from diffusers import StableDiffusionInpaintPipeline +from .object_graph import ObjectGraph, ObjectGraphGroup +import re +from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo, to_image_list, create_positive_map, \ + create_positive_map_label_to_token_from_positive_map from .nms import nms from vis_utils import html_embed_image, html_colored_span, vis_masks @@ -191,18 +199,483 @@ def 
execute(self,prog_step,inspect=False): return answer +class BuildInterpreter(): + step_name = "BUILD" + + def __init__(self): + self.vqa = VQAInterpreter() + + self.crop = CropInterpreter() + + def parse(self,prog_step): + parse_result = parse_step(prog_step.prog_str) + step_name = parse_result['step_name'] + objs_var = parse_result['args']['objects'] + output_var = parse_result['output_var'] + assert(step_name==self.step_name) + return objs_var,output_var + + def execute(self, prog_step,inspect=False): + objs_var, output_var = self.parse(prog_step) + + objs = prog_step.state[objs_var] + img = objs["img"] + category = objs["category"] + + graphs = ObjectGraphGroup(img) + + for obj in objs["box"]: + cropped_img = self.crop.cropbox(obj, img) + + answer = self.vqa.predict(cropped_img, f"What's this {category}?") + + mid_w, mid_h = (obj[0] + obj[2]) / 2, (obj[1] + obj[3]) / 2 + w, h = img.size + location = (mid_w / w, mid_h / h) + size = (obj[2] - obj[0], obj[3] - obj[1]) + + graphs.add_graph(ObjectGraph(answer, obj, category, location, size)) + + print(graphs.Graphs) + prog_step.state[output_var] = graphs + +class ADDInterpreter(): + step_name = 'ADD' + + def __init__(self): + self.vqa = VQAInterpreter() + + def parse(self,prog_step): + parse_result = parse_step(prog_step.prog_str) + step_name = parse_result['step_name'] + graph_var = parse_result['args']['graph'] + attribute_str = parse_result['args']['attribute'] + output_var = parse_result['output_var'] + assert(step_name==self.step_name) + return graph_var,attribute_str,output_var + + def execute(self, prog_step,inspect=False): + graph_var, attribute_str, output_var = self.parse(prog_step) + + graph = prog_step.state[graph_var] + img = graph.Img + + res_graph = ObjectGraphGroup(img=img) + + for obj in graph.Graphs: + if attribute_str in obj.Attribute: + pass + else: + cropped_img = img.crop(obj.Attribute["Box"]) + obj_name = obj.Attribute["Name"] + answer = self.vqa.predict(cropped_img, f"What's the {attribute_str} of this {obj_name}?") + + obj.add(attribute_str, answer) + + res_graph.add_graph(obj) + + prog_step.state[output_var] = res_graph + + return res_graph + +class MERGEInterpreter(): + step_name = 'MERGE' + + RELATION_MESSAGE = [ + { + "role": "system", + "content": """Given the relationship (subject, object): relationship, generate a question that asks about this relationship. + +For example: +**Input**: (bottles, wine): right_of +**Output**: "Is the bottles to the right of the wine?" + +**Input**: (book, table): under +**Output**: "Is the book under the table?" + +Please follow this format to create the questions. +""" + }, + { + "role": "user", + "content": """**Input:** (people, umbrella): carry""" + }, + { + "role": "assistant", + "content": """**Output:** "Is the people carrying the umbrella?" """ + }, + { + "role": "user", + "content": """**Input:** (book, table): under""" + }, + { + "role": "assistant", + "content": """**Output:** "Is the book under the table?"
""" + } + ] + + def __init__(self): + self.vqa = VQAInterpreter() + self.client = OpenAI() + + def parse(self,prog_step): + parse_result = parse_step(prog_step.prog_str) + step_name = parse_result['step_name'] + graph1_var = parse_result['args']['graphA'] + graph2_var = parse_result['args']['graphB'] + relation_str = parse_result['args']['relation'] + output_var = parse_result['output_var'] + assert(step_name==self.step_name) + return graph1_var,graph2_var,relation_str,output_var + + def llm_ask(self, relation): + message = self.RELATION_MESSAGE + message.append({"role": "user", "content": f"**Input:** {relation}"}) + + response = self.client.chat.completions.create( + model="gpt-4o", + temperature=0.8, + messages=message + ) + + answer = response.choices[0].message.content + + matches = re.findall(r'"([^}]*)"', answer) + + if len(matches) > 0: + return matches[0] + + return "" + + def get_categories(self, group): + categories = [] + + for graph in group.Graphs: + if graph.Attribute["Category"] not in categories: + categories.append(graph.Attribute["Category"]) + + return categories + + def focus_image(self, img, box1, box2): + black_image = Image.new('RGB', img.size, (0, 0, 0)) + + region1 = img.crop(box1) + black_image.paste(region1, (box1[0], box1[1])) + + region2 = img.crop(box2) + black_image.paste(region2, (box2[0], box2[1])) + + return black_image + + def execute(self, prog_step,inspect=False): + graph1_var, graph2_var, relation_str, output_var = self.parse(prog_step) + + graph1 = prog_step.state[graph1_var] + graph2 = prog_step.state[graph2_var] + + if(relation_str == "None"): + + merged_graph = ObjectGraphGroup(groupA=graph1, groupB=graph2) + + else: + merged_graph = ObjectGraphGroup(groupA=graph1, groupB=graph2) + img = merged_graph.Img + + for objA in graph1.Graphs: + for objB in graph2.Graphs: + new_img = self.focus_image(img, objA.Attribute["Box"], objB.Attribute["Box"]) + new_img.save("test.jpg") + + Aname = objA.Attribute["Name"] + Bname = objB.Attribute["Name"] + + question = self.llm_ask(f"({Aname}, {Bname}): {relation_str}") + print(question) + + answer = self.vqa.predict(new_img, question) + + if(answer == "yes"): + merged_graph.add_relation(objA, objB, relation_str) + + prog_step.state[output_var] = merged_graph + + return merged_graph + +class HiddenPrints: + hide_prints = False + + def __init__(self, model_name=None, console=None, use_newline=True): + self.model_name = model_name + self.console = console + self.use_newline = use_newline + self.tqdm_aux = None + + def __enter__(self): + if self.hide_prints: + import tqdm # We need to do an extra step to hide tqdm outputs. Does not work in Jupyter Notebooks. 
+ + def nop(it, *a, **k): + return it + + self.tqdm_aux = tqdm.tqdm + tqdm.tqdm = nop + + if self.model_name is not None: + self.console.print(f'Loading {self.model_name}...') + self._original_stdout = sys.stdout + self._original_stderr = sys.stderr + sys.stdout = open(os.devnull, 'w') + # May not be what we always want, but some annoying warnings end up to stderr + sys.stderr = open(os.devnull, 'w') + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.hide_prints: + sys.stdout.close() + sys.stdout = self._original_stdout + sys.stderr = self._original_stderr + if self.model_name is not None: + self.console.print(f'{self.model_name} loaded ') + import tqdm + tqdm.tqdm = self.tqdm_aux + +class GLIPLocInterpreter(GLIPDemo): + step_name = 'LOC' + + def __init__(self, *args_demo): + + working_dir = f'./pretrained_models/GLIP/' + + config_file = working_dir + "configs/glip_Swin_L.yaml" + weight_file = working_dir + "checkpoints/glip_large_model.pth" + + kwargs = { + 'min_image_size': 800, + 'confidence_threshold': 0.5, + 'show_mask_heatmaps': False + } + + self.dev = "cuda" if torch.cuda.is_available() else "cpu" + + from maskrcnn_benchmark.config import cfg + + # manual override some options + cfg.local_rank = 0 + cfg.num_gpus = 1 + cfg.merge_from_file(config_file) + cfg.merge_from_list(["MODEL.WEIGHT", weight_file]) + cfg.merge_from_list(["MODEL.DEVICE", self.dev]) + + with HiddenPrints("GLIP"), torch.cuda.device(self.dev): + from transformers.utils import logging + logging.set_verbosity_error() + GLIPDemo.__init__(self, cfg, *args_demo, **kwargs) + if self.cfg.MODEL.RPN_ARCHITECTURE == "VLDYHEAD": + plus = 1 + else: + plus = 0 + self.plus = plus + self.color = 255 + + @torch.no_grad() + def compute_prediction(self, original_image, original_caption, custom_entity=None): + image = self.transforms(original_image) + # image = [image, image.permute(0, 2, 1)] + image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY) + image_list = image_list.to(self.dev) + # caption + if isinstance(original_caption, list): + + if len(original_caption) > 40: + all_predictions = None + for loop_num, i in enumerate(range(0, len(original_caption), 40)): + list_step = original_caption[i:i + 40] + prediction_step = self.compute_prediction(original_image, list_step, custom_entity=None) + if all_predictions is None: + all_predictions = prediction_step + else: + # Aggregate predictions + all_predictions.bbox = torch.cat((all_predictions.bbox, prediction_step.bbox), dim=0) + for k in all_predictions.extra_fields: + all_predictions.extra_fields[k] = \ + torch.cat((all_predictions.extra_fields[k], + prediction_step.extra_fields[k] + loop_num), dim=0) + return all_predictions + + # we directly provided a list of category names + caption_string = "" + tokens_positive = [] + seperation_tokens = " . 
" + for word in original_caption: + tokens_positive.append([len(caption_string), len(caption_string) + len(word)]) + caption_string += word + caption_string += seperation_tokens + + tokenized = self.tokenizer([caption_string], return_tensors="pt") + # tokens_positive = [tokens_positive] # This was wrong + tokens_positive = [[v] for v in tokens_positive] + + original_caption = caption_string + # print(tokens_positive) + else: + tokenized = self.tokenizer([original_caption], return_tensors="pt") + if custom_entity is None: + tokens_positive = self.run_ner(original_caption) + # print(tokens_positive) + # process positive map + positive_map = create_positive_map(tokenized, tokens_positive) + + positive_map_label_to_token = create_positive_map_label_to_token_from_positive_map(positive_map, + plus=self.plus) + self.positive_map_label_to_token = positive_map_label_to_token + tic = timeit.time.perf_counter() + + # compute predictions + with HiddenPrints(): # Hide some deprecated notices + predictions = self.model(image_list, captions=[original_caption], + positive_map=positive_map_label_to_token) + predictions = [o.to(self.cpu_device) for o in predictions] + # print("inference time per image: {}".format(timeit.time.perf_counter() - tic)) + + # always single image is passed at a time + prediction = predictions[0] + + # reshape prediction (a BoxList) into the original image size + height, width = original_image.shape[-2:] + # if self.tensor_inputs: + # else: + # height, width = original_image.shape[:-1] + prediction = prediction.resize((width, height)) + + if prediction.has_field("mask"): + # if we have masks, paste the masks in the right position + # in the image, as defined by the bounding boxes + masks = prediction.get_field("mask") + # always single image is passed at a time + masks = self.masker([masks], [prediction])[0] + prediction.add_field("mask", masks) + + return prediction + + @staticmethod + def to_left_right_upper_lower(bboxes): + return [(bbox[1], bbox[3], bbox[0], bbox[2]) for bbox in bboxes] + + @staticmethod + def to_xmin_ymin_xmax_ymax(bboxes): + # invert the previous method + return [(bbox[2], bbox[0], bbox[3], bbox[1]) for bbox in bboxes] + + @staticmethod + def prepare_image(image): + image = image[[2, 1, 0]] # convert to bgr for opencv-format for glip + return image + + @torch.no_grad() + def forward(self, image: torch.Tensor, obj: Union[str, list], return_labels: bool = False, + confidence_threshold=None): + + if confidence_threshold is not None: + original_confidence_threshold = self.confidence_threshold + self.confidence_threshold = confidence_threshold + + # if isinstance(object, list): + # object = ' . '.join(object) + ' .' 
# add separation tokens + image = self.prepare_image(image) + + # Avoid the resizing creating a huge image in a pathological case + ratio = image.shape[1] / image.shape[2] + ratio = max(ratio, 1 / ratio) + original_min_image_size = self.min_image_size + if ratio > 10: + self.min_image_size = int(original_min_image_size * 10 / ratio) + self.transforms = self.build_transform() + + with torch.cuda.device(self.dev): + inference_output = self.inference(image, obj) + + bboxes = inference_output.bbox.cpu().numpy().astype(int) + # bboxes = self.to_left_right_upper_lower(bboxes) + + if ratio > 10: + self.min_image_size = original_min_image_size + self.transforms = self.build_transform() + + bboxes = torch.tensor(bboxes) + + # Convert to [left, lower, right, upper] instead of [left, upper, right, lower] + height = image.shape[-2] + bboxes = torch.stack([bboxes[:, 0], height - bboxes[:, 3], bboxes[:, 2], height - bboxes[:, 1]], dim=1) + + if confidence_threshold is not None: + self.confidence_threshold = original_confidence_threshold + if return_labels: + # subtract 1 because it's 1-indexed for some reason + return bboxes, inference_output.get_field("labels").cpu().numpy() - 1 + return bboxes + + def box_image(self,img,boxes,highlight_best=True): + img1 = img.copy() + draw = ImageDraw.Draw(img1) + for i,box in enumerate(boxes): + if i==0 and highlight_best: + color = 'red' + else: + color = 'blue' + + draw.rectangle(box,outline=color,width=5) + + return img1 + + def parse(self,prog_step): + parse_result = parse_step(prog_step.prog_str) + step_name = parse_result['step_name'] + img_var = parse_result['args']['image'] + obj_name = eval(parse_result['args']['object']) + output_var = parse_result['output_var'] + assert(step_name==self.step_name) + return img_var,obj_name,output_var + + def execute(self,prog_step,inspect=False): + img_var,obj_name,output_var = self.parse(prog_step) + img = prog_step.state[img_var] + + bboxes = self.forward(img,obj_name) + + box_img = self.box_image(img, bboxes) + + objs = dict( + box = bboxes, + category = obj_name, + img = img + ) + + prog_step.state[output_var] = objs + prog_step.state[output_var+'_IMAGE'] = box_img + if inspect: + html_str = self.html(img, box_img, output_var, obj_name) + return objs, html_str + + return objs class LocInterpreter(): step_name = 'LOC' - def __init__(self,thresh=0.1,nms_thresh=0.5): + def __init__(self, thresh=0.1,nms_thresh=0.5): print(f'Registering {self.step_name} step') - self.device = "cuda:0" if torch.cuda.is_available() else "cpu" - self.processor = OwlViTProcessor.from_pretrained( - "google/owlvit-large-patch14") - self.model = OwlViTForObjectDetection.from_pretrained( - "google/owlvit-large-patch14").to(self.device) - self.model.eval() + + model_id = "IDEA-Research/grounding-dino-base" + self.device = "cuda" if torch.cuda.is_available() else "cpu" + + self.processor = AutoProcessor.from_pretrained(model_id) + self.model = AutoModelForZeroShotObjectDetection.from_pretrained( + model_id).to(self.device) + + special_model_id = "facebook/detr-resnet-50" + self.special_processor = DetrImageProcessor.from_pretrained( + special_model_id, revision="no_timm") + self.special_model = DetrForObjectDetection.from_pretrained( + special_model_id, revision="no_timm") + self.thresh = thresh self.nms_thresh = nms_thresh @@ -225,19 +698,22 @@ def normalize_coord(self,bbox,img_size): return [x1,y1,x2,y2] def predict(self,img,obj_name): - encoding = self.processor( - text=[[f'a photo of {obj_name}']], - images=img, - return_tensors='pt') - encoding 
= {k:v.to(self.device) for k,v in encoding.items()} + prompt = f"a {obj_name}." + + inputs = self.processor( + images=img, text=prompt, return_tensors="pt").to(self.device) + with torch.no_grad(): - outputs = self.model(**encoding) - for k,v in outputs.items(): - if v is not None: - outputs[k] = v.to('cpu') if isinstance(v, torch.Tensor) else v + outputs = self.model(**inputs) + + results = self.processor.post_process_grounded_object_detection( + outputs, + inputs.input_ids, + box_threshold=0.4, + text_threshold=0.3, + target_sizes=[img.size[::-1]] + ) - target_sizes = torch.Tensor([img.size[::-1]]) - results = self.processor.post_process_object_detection(outputs=outputs,threshold=self.thresh,target_sizes=target_sizes) boxes, scores = results[0]["boxes"], results[0]["scores"] boxes = boxes.cpu().detach().numpy().tolist() scores = scores.cpu().detach().numpy().tolist() @@ -256,6 +732,36 @@ def predict(self,img,obj_name): selected_boxes, selected_scores = nms( selected_boxes,selected_scores,self.nms_thresh) return selected_boxes + + def special_predict(self, img): + inputs = self.special_processor( + images=img, return_tensors="pt").to(self.device) + + with torch.no_grad(): + outputs = self.special_model(**inputs) + + target_sizes = torch.tensor([img.size[::-1]]) + results = self.special_processor.post_process_object_detection( + outputs, target_sizes=target_sizes, threshold=0.9)[0] + + boxes, scores = results["boxes"], results["scores"] + boxes = boxes.tolist() + scores = scores.tolist() + if len(boxes)==0: + return [] + + boxes, scores = zip(*sorted(zip(boxes,scores),key=lambda x: x[1],reverse=True)) + selected_boxes = [] + selected_scores = [] + for i in range(len(scores)): + if scores[i] > self.thresh: + coord = self.normalize_coord(boxes[i],img.size) + selected_boxes.append(coord) + selected_scores.append(scores[i]) + + selected_boxes, selected_scores = nms( + selected_boxes,selected_scores,self.nms_thresh) + return selected_boxes def top_box(self,img): w,h = img.size @@ -307,17 +813,26 @@ def execute(self,prog_step,inspect=False): bboxes = [self.left_box(img)] elif obj_name=='RIGHT': bboxes = [self.right_box(img)] + elif obj_name=='object': + bboxes = self.special_predict(img) else: bboxes = self.predict(img,obj_name) box_img = self.box_image(img, bboxes) - prog_step.state[output_var] = bboxes + + objs = dict( + box = bboxes, + category = obj_name, + img = img + ) + + prog_step.state[output_var] = objs prog_step.state[output_var+'_IMAGE'] = box_img if inspect: html_str = self.html(img, box_img, output_var, obj_name) - return bboxes, html_str + return objs, html_str - return bboxes + return objs class Loc2Interpreter(LocInterpreter): @@ -414,6 +929,12 @@ def html(self,img,out_img,output_var,box_img): step_name = html_step_name(self.step_name) box_arg = html_arg_name('bbox') return f"""
{output_var}={step_name}({box_arg}={box_img})={out_img}
""" + + def cropbox(self, box, img): + box = self.expand_box(box, img.size) + out_img = img.crop(box) + + return out_img def execute(self,prog_step,inspect=False): img_var,box_var,output_var = self.parse(prog_step) @@ -1028,7 +1549,7 @@ class ListInterpreter(): def __init__(self): print(f'Registering {self.step_name} step') - openai.api_key = os.getenv("OPENAI_API_KEY") + self.client = OpenAI() def parse(self,prog_step): parse_result = parse_step(prog_step.prog_str) @@ -1040,7 +1561,7 @@ def parse(self,prog_step): return text,list_max,output_var def get_list(self,text,list_max): - response = openai.Completion.create( + response = self.client.Completion.create( model="text-davinci-002", prompt=self.prompt_template.format(list_max=list_max,text=text), temperature=0.7, @@ -1377,4 +1898,12 @@ def register_step_interpreters(dataset='nlvr'): RESULT=ResultInterpreter(), TAG=TagInterpreter(), LOC=Loc2Interpreter(thresh=0.05,nms_thresh=0.3) + ) + elif dataset=='graph': + return dict( + LOC=GLIPLocInterpreter(), + BUILD=BuildInterpreter(), + ADD=ADDInterpreter(), + MERGE=MERGEInterpreter(), + RESULT=ResultInterpreter() ) \ No newline at end of file diff --git a/engine/utils.py b/engine/utils.py index f1bc03f..e6f90e1 100644 --- a/engine/utils.py +++ b/engine/utils.py @@ -3,6 +3,8 @@ import openai import numpy as np import copy +import re +from FlagEmbedding import BGEM3FlagModel from .step_interpreters import register_step_interpreters, parse_step @@ -49,6 +51,8 @@ def execute(self,prog,init_state,inspect=False): class ProgramGenerator(): def __init__(self,prompter,temperature=0.7,top_p=0.5,prob_agg='mean'): openai.api_key = os.getenv("OPENAI_API_KEY") + # print("hello") + # print(os.getenv("OPENAI_API_KEY")) self.prompter = prompter self.temperature = temperature self.top_p = top_p @@ -56,7 +60,7 @@ def __init__(self,prompter,temperature=0.7,top_p=0.5,prob_agg='mean'): def compute_prob(self,response): eos = '<|endoftext|>' - for i,token in enumerate(response.choices[0]['logprobs']['tokens']): + for i,token in enumerate(response.choices[0].logprobs): if token==eos: break @@ -68,22 +72,128 @@ def compute_prob(self,response): raise NotImplementedError return np.exp(agg_fn( - response.choices[0]['logprobs']['token_logprobs'][:i])) + response.choices[0].logprobs.token_logprobs[:i])) def generate(self,inputs): - response = openai.Completion.create( - model="text-davinci-003", - prompt=self.prompter(inputs), + response = openai.chat.completions.create( + model="gpt-4o", + messages=self.prompter.prompt(inputs), temperature=self.temperature, max_tokens=512, top_p=self.top_p, frequency_penalty=0, presence_penalty=0, n=1, - logprobs=1 + logprobs=True ) - prob = self.compute_prob(response) - prog = response.choices[0]['text'].lstrip('\n').rstrip('\n') - return prog, prob + # print(response.choices[0].logprobs) + + # prob = self.compute_prob(response) + answer = response.choices[0].message.content + + return self.prompter.parse(answer) + +class ProgramSynthesis: + + def __init__(self): + self.word_close_model = BGEM3FlagModel('BAAI/bge-m3', + use_fp16=True) + + def parse_objectattribute(self, ObjAttri: str): + + result = {} + + pattern = r'(\w+):\s*([^:\n]+)' + matches = re.findall(pattern, ObjAttri) + + # 解析匹配结果并填充字典 + for match in matches: + key = match[0] + values = match[1].split(', ') + result[key] = values + + return result + + def parse_relation(self, Relation: str): + result = {} + + pattern = r'\(([^)]+)\):\s*([^\n]+)' + matches = re.findall(pattern, Relation) + + for match in matches: + objs = 
match[0].split(', ') + if(len(objs) == 2): + key = tuple(objs) + value = match[1].strip() + result[key] = value + + return result + + def find_close(self, word: str, obj_dict: dict): + candidates = list(obj_dict.keys()) + + embeddings_1 = self.word_close_model.encode([word], + batch_size=12, + max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process. + )['dense_vecs'] + embeddings_2 = self.word_close_model.encode(candidates)['dense_vecs'] + similarity = embeddings_1 @ embeddings_2.T + arr = np.array(similarity[0]) + + return candidates[np.argmax(arr)] + + def synthesis(self, ObjAttri: str, Relation: str): + + ObjAttriDict = self.parse_objectattribute(ObjAttri) + RelationDict = self.parse_relation(Relation) + + print(ObjAttriDict) + print(RelationDict) + + obj_var = {} + code = "" + index = 0 + obj_idx = 0 + + for obj_name, attributes in ObjAttriDict.items(): + code += f'OBJS{obj_idx}=LOC(image=IMAGE, object="{obj_name}")\n' + code += f'GRAPH{index}=BUILD(objects=OBJS{obj_idx})\n' + + index += 1 + obj_idx += 1 + + for attri in attributes: + code += f'GRAPH{index}=ADD(graph=GRAPH{index-1}, attribute={attri})\n' + index += 1 + + obj_var[obj_name] = index - 1 + + final_merge_start = index + + for (obja, objb), relation in RelationDict.items(): + if obja not in obj_var: + obja = self.find_close(obja, obj_var) + if objb not in obj_var: + objb = self.find_close(objb, obj_var) + + a_idx = obj_var[obja] + b_idx = obj_var[objb] + + code += f'GRAPH{index}=MERGE(graphA=GRAPH{a_idx}, graphB=GRAPH{b_idx}, relation={relation})\n' + + index += 1 + + result_index = index - 1 + + if(len(RelationDict) > 1): + for i in range(1, len(RelationDict)): + code += f'GRAPH{index}=MERGE(graphA=GRAPH{index-1}, graphB=GRAPH{final_merge_start+i}, relation={None})\n' + index += 1 + + result_index = index - 1 + + code += f"FINAL=RESULT(var=GRAPH{result_index})" + + return code \ No newline at end of file diff --git a/notebooks/gqa.ipynb b/notebooks/gqa.ipynb index 3beefac..2803b5a 100644 --- a/notebooks/gqa.ipynb +++ b/notebooks/gqa.ipynb @@ -1,56 +1,63 @@ { "cells": [ { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import os\n", - "import sys\n", - "module_path = os.path.abspath(os.path.join('..'))\n", - "if module_path not in sys.path:\n", - " sys.path.append(module_path)" + "> Note: First time run the code may take 15 minutes." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [ - "%env OPENAI_API_KEY=" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: OPENAI_API_KEY=you-key\n", + "Registering LOC step\n", + "Registering COUNT step\n", + "Registering CROP step\n", + "Registering CROP_RIGHTOF step\n", + "Registering CROP_LEFTOF step\n", + "Registering CROP_FRONTOF step\n", + "Registering CROP_INFRONTOF step\n", + "Registering CROP_INFRONT step\n", + "Registering CROP_BEHIND step\n", + "Registering CROP_AHEAD step\n", + "Registering CROP_BELOW step\n", + "Registering CROP_ABOVE step\n", + "Registering VQA step\n", + "Registering EVAL step\n", + "Registering RESULT step\n", + "hello\n", + "you-key\n" + ] + } + ], "source": [ + "import os\n", + "import sys\n", + "module_path = os.path.abspath(os.path.join('..'))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path)\n", + "\n", + "# os.environ[\"http_proxy\"] = \"http://172.20.0.113:12798\"\n", + "# os.environ[\"https_proxy\"] = \"http://172.20.0.113:12798\"\n", + "\n", + "%env OPENAI_API_KEY=your-key\n", + "\n", "from PIL import Image\n", "from IPython.core.display import HTML\n", "from functools import partial\n", "\n", "from engine.utils import ProgramGenerator, ProgramInterpreter\n", - "from prompts.gqa import create_prompt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "interpreter = ProgramInterpreter(dataset='gqa')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "from prompts.gqa import create_prompt\n", + "\n", + "interpreter = ProgramInterpreter(dataset='gqa')\n", + "\n", "prompter = partial(create_prompt,method='all')\n", "generator = ProgramGenerator(prompter=prompter)" ] @@ -61,7 +68,7 @@ "metadata": {}, "outputs": [], "source": [ - "image = Image.open('../assets/camel1.png')\n", + "image = Image.open('../assets/COCO_train2014_000000269022.jpg')\n", "image.thumbnail((640,640),Image.Resampling.LANCZOS)\n", "init_state = dict(\n", " IMAGE=image.convert('RGB')\n", @@ -75,7 +82,7 @@ "metadata": {}, "outputs": [], "source": [ - "question = \"How many people or animals are in the image?\"\n", + "question = \"Is the girl touching the ground?\"\n", "# question = \"Are there more animals than people in the image?\"\n", "# question = \"Localize the woman and tell me the color of her dress.\"\n", "# question = \"Find and tell me the name of the animal in the image.\"\n", @@ -83,7 +90,7 @@ "# question = \"How many women are to the left of the camel?\"\n", "# question = \"Is the lamp to the left of the woman lit?\"\n", "# question = \"Is there a sun in the sky?\"\n", - "prog,_ = generator.generate(dict(question=question))\n", + "prog = generator.generate(question)\n", "print(prog)" ] }, @@ -131,7 +138,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.10.14" }, "orig_nbformat": 4 }, diff --git a/notebooks/text b/notebooks/text new file mode 100644 index 0000000..7a487fa --- /dev/null +++ b/notebooks/text @@ -0,0 +1 @@ +ChoiceLogprobs(content=[ChatCompletionTokenLogprob(token='BOX', bytes=[66, 79, 88], logprob=-0.0009105099, top_logprobs=[]), ChatCompletionTokenLogprob(token='0', bytes=[48], logprob=-8.7212284e-05, top_logprobs=[]), 
ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-2.188868e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='LOC', bytes=[76, 79, 67], logprob=-9.4914985e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='(image', bytes=[40, 105, 109, 97, 103, 101], logprob=-0.000110457004, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-6.456359e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='IMAGE', bytes=[73, 77, 65, 71, 69], logprob=-9.849109e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token=',', bytes=[44], logprob=-2.319992e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='object', bytes=[111, 98, 106, 101, 99, 116], logprob=-0.0012027314, top_logprobs=[]), ChatCompletionTokenLogprob(token="='", bytes=[61, 39], logprob=-2.7848862e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='person', bytes=[112, 101, 114, 115, 111, 110], logprob=-0.2478368, top_logprobs=[]), ChatCompletionTokenLogprob(token="')\n", bytes=[39, 41, 10], logprob=-5.6576944e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='BOX', bytes=[66, 79, 88], logprob=-0.00048131612, top_logprobs=[]), ChatCompletionTokenLogprob(token='1', bytes=[49], logprob=-2.6418418e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-5.157039e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='LOC', bytes=[76, 79, 67], logprob=-9.968313e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='(image', bytes=[40, 105, 109, 97, 103, 101], logprob=-7.505351e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-3.2736214e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='IMAGE', bytes=[73, 77, 65, 71, 69], logprob=-2.1650272e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token=',', bytes=[44], logprob=-4.465658e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='object', bytes=[111, 98, 106, 101, 99, 116], logprob=-8.017927e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token="='", bytes=[61, 39], logprob=-1.1756368e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='animal', bytes=[97, 110, 105, 109, 97, 108], logprob=-0.004169222, top_logprobs=[]), ChatCompletionTokenLogprob(token="')\n", bytes=[39, 41, 10], logprob=-0.00016802136, top_logprobs=[]), ChatCompletionTokenLogprob(token='ANS', bytes=[65, 78, 83], logprob=-0.0016503064, top_logprobs=[]), ChatCompletionTokenLogprob(token='WER', bytes=[87, 69, 82], logprob=-0.000102827966, top_logprobs=[]), ChatCompletionTokenLogprob(token='0', bytes=[48], logprob=-1.92662e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-2.4584822e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='COUNT', bytes=[67, 79, 85, 78, 84], logprob=-5.3596854e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='(box', bytes=[40, 98, 111, 120], logprob=-8.9834764e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-2.188868e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='BOX', bytes=[66, 79, 88], logprob=-6.511407e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='0', bytes=[48], logprob=-5.931863e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token=')\n', bytes=[41, 10], logprob=-0.003594599, top_logprobs=[]), ChatCompletionTokenLogprob(token='ANS', bytes=[65, 78, 83], logprob=-7.58424e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='WER', bytes=[87, 69, 82], logprob=-1.1041146e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='1', bytes=[49], logprob=-2.6895234e-05, 
top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-1.27099975e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='COUNT', bytes=[67, 79, 85, 78, 84], logprob=-2.4584822e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='(box', bytes=[40, 98, 111, 120], logprob=-5.276243e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-4.012684e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='BOX', bytes=[66, 79, 88], logprob=-7.111979e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='1', bytes=[49], logprob=-0.00015812746, top_logprobs=[]), ChatCompletionTokenLogprob(token=')\n', bytes=[41, 10], logprob=-5.347765e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='ANS', bytes=[65, 78, 83], logprob=-0.040827427, top_logprobs=[]), ChatCompletionTokenLogprob(token='WER', bytes=[87, 69, 82], logprob=-2.7133641e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='2', bytes=[50], logprob=-0.012698344, top_logprobs=[]), ChatCompletionTokenLogprob(token='=E', bytes=[61, 69], logprob=-0.020761896, top_logprobs=[]), ChatCompletionTokenLogprob(token='VAL', bytes=[86, 65, 76], logprob=0.0, top_logprobs=[]), ChatCompletionTokenLogprob(token='(expr', bytes=[40, 101, 120, 112, 114], logprob=-0.00011880126, top_logprobs=[]), ChatCompletionTokenLogprob(token='="{', bytes=[61, 34, 123], logprob=-0.0070076943, top_logprobs=[]), ChatCompletionTokenLogprob(token='ANS', bytes=[65, 78, 83], logprob=-6.392203e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='WER', bytes=[87, 69, 82], logprob=-5.4385737e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='0', bytes=[48], logprob=-1.1994775e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='}', bytes=[125], logprob=-7.3458323e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token=' +', bytes=[32, 43], logprob=-0.014847175, top_logprobs=[]), ChatCompletionTokenLogprob(token=' {', bytes=[32, 123], logprob=-2.2723105e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='ANS', bytes=[65, 78, 83], logprob=-5.6769813e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='WER', bytes=[87, 69, 82], logprob=-8.537869e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='1', bytes=[49], logprob=-1.4974867e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='}")\n', bytes=[125, 34, 41, 10], logprob=-3.88156e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='FINAL', bytes=[70, 73, 78, 65, 76], logprob=-3.070975e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='_RESULT', bytes=[95, 82, 69, 83, 85, 76, 84], logprob=-2.2842309e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-2.6299214e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='RESULT', bytes=[82, 69, 83, 85, 76, 84], logprob=-7.107425e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='(var', bytes=[40, 118, 97, 114], logprob=-7.58424e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-4.8425554e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='ANS', bytes=[65, 78, 83], logprob=-3.0545007e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='WER', bytes=[87, 69, 82], logprob=-2.5226382e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='2', bytes=[50], logprob=-0.0005008472, top_logprobs=[]), ChatCompletionTokenLogprob(token=')', bytes=[41], logprob=-1.0280384e-06, top_logprobs=[])]) diff --git a/prompts/gqa.py b/prompts/gqa.py index 1b572fa..1307708 100644 --- a/prompts/gqa.py +++ b/prompts/gqa.py @@ -1,53 +1,145 @@ 
import random
-GQA_CURATED_EXAMPLES=[
-"""Question: Is the vehicle in the top of the image?
+GQA_CURATED_MESSAGES=[
+{
+"role": "system",
+"content": """You are a visual programmer. You need to generate the corresponding code according to the user's question (without seeing the picture).
+Here are some visual functions:
+- LOC(image=, object=): it can locate the specific region of the image and return the region box; it is able to detect objects.
+- CROP(image=, box=): it can crop the image, fitting into the region box.
+- COUNT(box=): it can count the number of region boxes.
+- CROP_RIGHTOF(image=,box=): it can crop the image, leaving the area to the right of the region box.
+- CROP_LEFTOF(image=,box=): it can crop the image, leaving the area to the left of the region box.
+- CROP_FRONTOF(image=,box=): it can crop the image, leaving the area in front of the region box.
+- CROP_INFRONTOF(image=,box=): it can crop the image, leaving the area in front of the region box.
+- CROP_INFRONT(image=,box=): it can crop the image, leaving the area in front of the region box.
+- CROP_BEHIND(image=,box=): it can crop the image, leaving the area behind the region box.
+- CROP_AHEAD(image=,box=): it can crop the image, leaving the area ahead of the region box.
+- CROP_BELOW(image=,box=): it can crop the image, leaving the area below the region box.
+- CROP_ABOVE(image=,box=): it can crop the image, leaving the area above the region box.
+- VQA(image=,question=): it can answer the question according to the image.
+- RESULT(var=): it can use a variable as the final result.
+
+REMEMBER: You must always generate a program. Try to handle the question step by step.
+"""
+},
+{
+"role": "user",
+"content": "Question: Is the vehicle in the top of the image?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to locate the top area.
+2. Find vehicles.
+3. Count the number of vehicles.
+4. If the number of vehicles is greater than 0, say "yes".
+5. Otherwise, say "no".
Program:
BOX0=LOC(image=IMAGE,object='TOP')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='vehicle')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Are there trains or fences in this scene?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: Are there trains or fences in this scene?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the trains.
+2. I need to find the fences.
+3. Count the number of trains.
+4. Count the number of fences.
+5. Add the numbers of trains and fences.
+6. If the sum is greater than 0, say "yes". Otherwise, "no".
Program:
BOX0=LOC(image=IMAGE,object='train')
BOX1=LOC(image=IMAGE,object='fence')
ANSWER0=COUNT(box=BOX0)
ANSWER1=COUNT(box=BOX1)
ANSWER2=EVAL(expr="'yes' if {ANSWER0} + {ANSWER1} > 0 else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER)
-""",
-"""Question: Who is carrying the umbrella?
+FINAL_RESULT=RESULT(var=ANSWER2)"""
+},
+{
+"role": "user",
+"content": "Question: Who is carrying the umbrella?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the umbrella.
+2. Check out the area of the umbrella.
+3. Ask who is carrying the umbrella.
Program:
BOX0=LOC(image=IMAGE,object='umbrella')
IMAGE0=CROP(image=IMAGE,box=BOX0)
ANSWER0=VQA(image=IMAGE0,question='Who is carrying the umbrella?')
-FINAL_RESULT=RESULT(var=ANSWER0)
-""",
-"""Question: Which place is it?
+FINAL_RESULT=RESULT(var=ANSWER0)"""
+},
+{
+"role": "user",
+"content": "Question: Which place is it?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I don't have any object information, so I decide to simply ask.
Program:
ANSWER0=VQA(image=IMAGE,question='Which place is it?')
-FINAL_RESULT=RESULT(var=ANSWER0)
-""",
-"""Question: What color is the curtain that is to the right of the mirror?
+FINAL_RESULT=RESULT(var=ANSWER0)"""
+},
+{
+"role": "user",
+"content": "Question: What color is the curtain that is to the right of the mirror?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the mirror.
+2. Locate the area to the right of the mirror.
+3. Ask the color of the curtain.
+4. Mark the color as the answer.
Program:
BOX0=LOC(image=IMAGE,object='mirror')
IMAGE0=CROP_RIGHTOF(image=IMAGE,box=BOX0)
ANSWER0=VQA(image=IMAGE0,question='What color is the curtain?')
-FINAL_RESULT=RESULT(var=ANSWER0)
-""",
-"""Question: Is the pillow in the top part or in the bottom of the picture?
+FINAL_RESULT=RESULT(var=ANSWER0)"""
+},
+{
+"role": "user",
+"content": "Question: Is the pillow in the top part or in the bottom of the picture?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I decide to check out the top of the picture.
+2. Are there any pillows?
+3. If the number of pillows is greater than 0, say "top".
+4. Otherwise, say "bottom".
Program:
BOX0=LOC(image=IMAGE,object='TOP')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='pillow')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'top' if {ANSWER0} > 0 else 'bottom'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Question: Do you see bottles to the right of the wine on the left of the picture?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: Do you see bottles to the right of the wine on the left of the picture?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to locate the left of the picture.
+2. I need to find the wine on the left of the picture.
+3. Check out the area to the right of the wine on the left of the picture.
+4. Are there any bottles?
+5. If the number of bottles is greater than 0, say "yes". Otherwise, say "no".
Program:
BOX0=LOC(image=IMAGE,object='LEFT')
IMAGE0=CROP(image=IMAGE,box=BOX0)
@@ -56,32 +148,71 @@
BOX2=LOC(image=IMAGE1,object='bottles')
ANSWER0=COUNT(box=BOX2)
ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Is the street light standing behind a truck?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: Is the street light standing behind a truck?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the truck.
+2. Check out the area behind the truck.
+3. Count the street lights behind the truck.
+4. If the number of street lights is greater than 0, say "yes". Otherwise, say "no".
Program:
BOX0=LOC(image=IMAGE,object='truck')
IMAGE0=CROP_BEHIND(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='street light')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Which side is the food on?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: Which side is the food on?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I decide to check the right side first.
+2. Check whether there is any food on the right side.
+3. If the number of food on the right side is greater than zero, say "right".
+4. Otherwise, "left".
Program:
BOX0=LOC(image=IMAGE,object='RIGHT')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='food')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'right' if {ANSWER0} > 0 else 'left'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: What do the wetsuit and the sky have in common?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: What do the wetsuit and the sky have in common?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I have no idea of the common attribute, so I decide to simply ask.
Program:
ANSWER0=VQA(image=IMAGE,question='What do the wetsuit and the sky have in common?')
-FINAL_RESULT=RESULT(var=ANSWER0)
-""",
-"""Question: Do the post and the sign have a different colors?
+FINAL_RESULT=RESULT(var=ANSWER0)"""
+},
+{
+"role": "user",
+"content": "Question: Do the post and the sign have a different colors?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the post.
+2. I need to find the sign.
+3. I need to ask the color of the post.
+4. I need to ask the color of the sign.
+5. Compare the colors of the post and the sign.
+6. If the colors are different, say "yes". Otherwise, "no".
Program:
BOX0=LOC(image=IMAGE,object='post')
IMAGE0=CROP(image=IMAGE,box=BOX0)
@@ -90,40 +221,86 @@
ANSWER0=VQA(image=IMAGE0,question='What color is the post?')
ANSWER1=VQA(image=IMAGE1,question='What color is the sign?')
ANSWER2=EVAL(expr="'yes' if {ANSWER0} != {ANSWER1} else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER2)
-""",
-"""Question: Does the traffic cone have white color?
+FINAL_RESULT=RESULT(var=ANSWER2)"""
+},
+{
+"role": "user",
+"content": "Question: Does the traffic cone have white color?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the traffic cone.
+2. I need to ask the color of the traffic cone.
+3. If the color of the traffic cone is white, I say "yes". Otherwise, "no".
Program:
BOX0=LOC(image=IMAGE,object='traffic cone')
IMAGE0=CROP(image=IMAGE,box=BOX0)
ANSWER0=VQA(image=IMAGE0,question='What color is the traffic cone?')
ANSWER1=EVAL(expr="'yes' if {ANSWER0} == 'white' else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Are these animals of different species?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: Are these animals of different species?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I can't check every animal, so I decide to simply ask.
Program:
ANSWER0=VQA(image=IMAGE,question='Are these animals of different species?')
-FINAL_RESULT=RESULT(var=ANSWER0)
-""",
-"""Question: Which side of the image is the chair on?
+FINAL_RESULT=RESULT(var=ANSWER0)"""
+},
+{
+"role": "user",
+"content": "Question: Which side of the image is the chair on?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. First, I check the right side of the image.
+2. Is there a chair?
+3. If there is a chair, I say "right". Otherwise, "left".
Program:
BOX0=LOC(image=IMAGE,object='RIGHT')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='chair')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'right' if {ANSWER0} > 0 else 'left'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Do you see any drawers to the left of the plate?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: Do you see any drawers to the left of the plate?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the plate.
+2. I need to look at the left of the plate.
+3. Are there any drawers?
+4. If the number of drawers is greater than 0, I say "yes". Otherwise, "no".
Program:
BOX0=LOC(image=IMAGE,object='plate')
IMAGE0=CROP_LEFTOF(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='drawers')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Does the mat have the same color as the sky?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: Does the mat have the same color as the sky?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the sky.
+2. I need to find the mat.
+3. I need to know the sky's color.
+4. I need to know the mat's color.
+5. If the two colors are the same, I say "yes". Otherwise, "no".
Program:
BOX0=LOC(image=IMAGE,object='sky')
IMAGE0=CROP(image=IMAGE,box=BOX0)
@@ -132,57 +309,45 @@
ANSWER0=VQA(image=IMAGE0,question='What color is the sky?')
ANSWER1=VQA(image=IMAGE1,question='What color is the mat?')
ANSWER2=EVAL(expr="'yes' if {ANSWER0} == {ANSWER1} else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER2)
-""",
-"""Question: Is a cat above the mat?
+FINAL_RESULT=RESULT(var=ANSWER2)"""
+},
+{
+"role": "user",
+"content": "Question: Is a cat above the mat?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the mat.
+2. Look above the mat and see if there is a cat.
+3. If there is a cat, I say "yes". Otherwise, "no".
Program:
BOX0=LOC(image=IMAGE,object='mat')
IMAGE0=CROP_ABOVE(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='cat')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-"""
-"""Question: Is the cat above a mat?
-Program:
-BOX0=LOC(image=IMAGE,object='cat')
-IMAGE0=CROP_BELOW(image=IMAGE,box=BOX0)
-BOX1=LOC(image=IMAGE0,object='mat')
-ANSWER0=COUNT(box=BOX1)
-ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 and else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Is the mat below a cat?
-Program:
-BOX0=LOC(image=IMAGE,object='mat')
-IMAGE0=CROP_ABOVE(image=IMAGE,box=BOX0)
-BOX1=LOC(image=IMAGE0,object='cat')
-ANSWER0=COUNT(box=BOX1)
-ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Is a mat below the cat?
-Program:
-BOX0=LOC(image=IMAGE,object='cat')
-IMAGE0=CROP_BELOW(image=IMAGE,box=BOX0)
-BOX1=LOC(image=IMAGE0,object='mat')
-ANSWER0=COUNT(box=BOX1)
-ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 and else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
]
def create_prompt(inputs,num_prompts=8,method='random',seed=42,group=0):
-    if method=='all':
-        prompt_examples = GQA_CURATED_EXAMPLES
-    elif method=='random':
-        random.seed(seed)
-        prompt_examples = random.sample(GQA_CURATED_EXAMPLES,num_prompts)
-    else:
-        raise NotImplementedError
+    # if method=='all':
+    #     prompt_examples = GQA_CURATED_EXAMPLES
+    # elif method=='random':
+    #     random.seed(seed)
+    #     prompt_examples = random.sample(GQA_CURATED_EXAMPLES,num_prompts)
+    # else:
+    #     raise NotImplementedError
+
+
+
+    # prompt_examples = '\n'.join(prompt_examples)
+    # prompt_examples = f'Think step by step to answer the question.\n\n{prompt_examples}'
-    prompt_examples = '\n'.join(prompt_examples)
-    prompt_examples = f'Think step by step to answer the question.\n\n{prompt_examples}'
+    prompt_examples = list(GQA_CURATED_MESSAGES)  # copy so repeated calls do not grow the shared example list
+    prompt_examples.append({"role":"user", "content":f"Question: {inputs}"})
+    # print(prompt_examples[35]["content"])
-    return prompt_examples + "\nQuestion: {question}\nProgram:".format(**inputs)
\ No newline at end of file
+    return prompt_examples
\ No newline at end of file
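Note on the prompts/gqa.py change above: create_prompt now returns an OpenAI-style chat message list (system message, few-shot user/assistant turns, then the new question) instead of a single completion string. The sketch below shows one way such a message list could be sent to the chat completions API and the program text recovered from the reply; the model name, client setup, and the "Program:" parsing are assumptions for illustration, since the repository's actual call lives in engine.utils.ProgramGenerator, which this diff does not touch.

```
# Illustrative sketch only; model name and reply parsing are assumptions.
import os

from openai import OpenAI

from prompts.gqa import create_prompt

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def generate_program(question: str) -> str:
    # system prompt + few-shot turns + the new question, as built by create_prompt
    messages = create_prompt(question)
    reply = client.chat.completions.create(
        model="gpt-3.5-turbo",  # assumed model
        messages=messages,
        temperature=0,
    ).choices[0].message.content
    # The few-shot replies follow "Reasoning: ... Program: ...", so keep only the program part.
    return reply.split("Program:")[-1].strip()


print(generate_program("Is the girl touching the ground?"))
```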
diff --git a/prompts/object_attribute.py b/prompts/object_attribute.py
new file mode 100644
index 0000000..ad971ea
--- /dev/null
+++ b/prompts/object_attribute.py
@@ -0,0 +1,94 @@
+import re
+from .prompter import Prompter
+
+class ObjectAttributePrompter(Prompter):
+
+    MESSAGE = [
+        {
+            "role": "system",
+            "content": """Given a question, identify the entities mentioned in the question and list their corresponding attributes. Each entity should be on a new line, and each attribute should be separated by a comma. Here is an example:
+
+**Question:** "Which kind of clothing is not black?"
+
+**Answer:**
+
+```
+clothing: color
+```
+
+Make sure to maintain the format and accuracy in identifying entities and their attributes for each question.
+"""
+        },
+        {
+            "role": "user",
+            "content": "**Question**: Who is carrying the umbrella?"
+        },
+        {
+            "role": "assistant",
+            "content": """**Answer:**
+
+```
+people: position
+umbrella: position
+```
+"""
+        },
+        {
+            "role": "user",
+            "content": "**Question**: Which place is it?"
+        },
+        {
+            "role": "assistant",
+            "content": """**Answer:**
+
+```
+object: usage, scene
+```
+"""
+        },
+        {
+            "role": "user",
+            "content": "**Question**: Does the clothing look large?"
+        },
+        {
+            "role": "assistant",
+            "content": """**Answer:**
+
+```
+clothing: size
+```
+"""
+        },
+        {
+            "role": "user",
+            "content": "**Question**: Is the chair warm?"
+        },
+        {
+            "role": "assistant",
+            "content": """**Answer:**
+
+```
+chair: feeling(warm or cold)
+```
+"""
+        },
+    ]
+
+    def prompt(self, input):
+
+        prompt_examples = list(self.MESSAGE)  # copy so repeated calls do not mutate the class-level MESSAGE
+        prompt_examples.append({"role":"user", "content":f"**Question**: {input}"})
+
+        # print(prompt_examples[35]["content"])
+
+        return prompt_examples
+
+    def parse(self, output):
+        matches = re.findall(r'```([^`]*)```', output)
+
+        if len(matches) > 0:
+            return matches[0]
+
+        return ""
+
+    
\ No newline at end of file
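A short usage sketch for the new ObjectAttributePrompter: prompt() builds the chat messages for a question and parse() pulls the fenced block out of the model's reply. The chat call and model name below are assumptions; only prompt() and parse() come from the file above.

```
# Sketch: driving ObjectAttributePrompter by hand (the chat call is an assumption).
import os

from openai import OpenAI

from prompts.object_attribute import ObjectAttributePrompter

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
prompter = ObjectAttributePrompter()

messages = prompter.prompt("Who is carrying the umbrella?")
reply = client.chat.completions.create(
    model="gpt-3.5-turbo",  # assumed model
    messages=messages,
).choices[0].message.content

# parse() returns the text between the triple-backtick fences,
# e.g. "people: position\numbrella: position" for the question above.
print(prompter.parse(reply))
```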
diff --git a/prompts/prompter.py b/prompts/prompter.py
new file mode 100644
index 0000000..e607f36
--- /dev/null
+++ b/prompts/prompter.py
@@ -0,0 +1,12 @@
+class Prompter:
+
+    MESSAGE = []
+
+    def __init__(self):
+        pass
+
+    def prompt(self, input):
+        return input
+
+    def parse(self, output):
+        return output
\ No newline at end of file
diff --git a/prompts/relation.py b/prompts/relation.py
new file mode 100644
index 0000000..2ef1353
--- /dev/null
+++ b/prompts/relation.py
@@ -0,0 +1,85 @@
+import re
+from .prompter import Prompter
+
+class RelationPrompter(Prompter):
+    MESSAGE = [
+        {
+            "role": "system",
+            "content": """Given a question and a list of objects along with their attributes, identify and describe the relationship between the objects mentioned in the question.
+
+For example, if the question is "Who is carrying the umbrella?" and the objects and attributes are:
+
+```
+people: position
+umbrella: position
+```
+
+The output should be:
+
+```
+(people, umbrella): carry
+```
+
+This indicates that there is a "carry" relationship between "people" and "umbrella".
+
+**Input:**
+
+1. A question.
+2. A list of objects with their associated attributes.
+
+**Output:**
+
+The relationship between the objects mentioned in the question, formatted as (object1, object2): relationship.
+ """
+        },
+        {
+            "role": "user",
+            "content": """**Question:** "Who is driving the car?"
+**Objects and attributes:**
+```
+person: role
+car: object
+```"""
+        },
+        {
+            "role": "assistant",
+            "content": """**Output:**
+```
+(person, car): drive
+```"""
+        },
+        {
+            "role": "user",
+            "content": """**Question:** "What animal is eating the grass?"
+**Objects and attributes:**
+```
+animal: species
+grass: object
+```"""
+        },
+        {
+            "role": "assistant",
+            "content": """**Output:**
+```
+(animal, grass): eat
+```"""
+        },
+    ]
+
+    def prompt(self, input):
+
+        prompt_examples = list(self.MESSAGE)  # copy so repeated calls do not mutate the class-level MESSAGE
+        Question = input["Question"]
+        ObjectAttribute = input["ObjectAttribute"]
+        prompt_examples.append({"role":"user", "content":f"**Question**: {Question}\n**Objects and attributes:**\n```\n{ObjectAttribute}\n```"})
+
+        return prompt_examples
+
+    def parse(self, output):
+        matches = re.findall(r'```([^`]*)```', output)
+
+        if len(matches) > 0:
+            return matches[0]
+
+        return ""
+    
\ No newline at end of file
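The two prompters share the minimal Prompter interface from prompts/prompter.py and are natural to chain: the attribute list parsed from ObjectAttributePrompter feeds RelationPrompter, whose prompt() expects a dict with "Question" and "ObjectAttribute" keys. Below is a hedged sketch of that chain, with a hypothetical ask() helper standing in for whatever driver code the repository actually uses.

```
# Sketch of chaining the two prompters; ask() is a hypothetical helper,
# not something defined in this diff.
import os

from openai import OpenAI

from prompts.object_attribute import ObjectAttributePrompter
from prompts.relation import RelationPrompter

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def ask(messages):
    # assumed model; returns the assistant's text reply
    return client.chat.completions.create(
        model="gpt-3.5-turbo", messages=messages
    ).choices[0].message.content


question = "Who is carrying the umbrella?"

# Step 1: entities and attributes mentioned in the question.
attr_prompter = ObjectAttributePrompter()
attributes = attr_prompter.parse(ask(attr_prompter.prompt(question)))

# Step 2: relationships between those entities.
rel_prompter = RelationPrompter()
relations = rel_prompter.parse(
    ask(rel_prompter.prompt({"Question": question, "ObjectAttribute": attributes}))
)
print(relations)  # expected to look like "(people, umbrella): carry"
```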